In [1]:
## Análisis de las transacciones a partir de dos archivos

In [2]:
## Eliminación y creación de las carpetas

In [3]:
!rm -rf input output
!mkdir input

In [4]:
## Creacion de los archivos fuente
##   transacciones:
##     Id Transaccion
##     Id Producto
##     Id Usuario
##     Valor Venta Producto
##     Nombre Producto
##   usuarios:
##     Id Usuario
##     Email Usuario
##     Lenguaje
##     Pais

In [5]:
%%writefile input/transacciones.txt
1	1	1	300	a jumper
2	1	2	300	a jumper
3	1	2	300	a jumper
4	2	3	100	a rubber chicken
5	1	3	300	a jumper

Writing input/transacciones.txt


In [6]:
%%writefile input/usuarios.txt
1	matthew@test.com	EN	US
2	matthew@test2.com	EN	GB
3	matthew@test3.com	FR	FR

Writing input/usuarios.txt


In [7]:
## Problema a resolver
##   Se desea tener una tabla que indique el código del producto y el país del cliente que lo adquirió

In [8]:
## Paso 1:
##   Se crea un Mapper que una la información de los archivos en un unico flujo de datos

In [9]:
%%writefile joinMapperTU.py
#!/usr/bin/env python3
import sys
def map_line(line):
	"""Convert one raw input line into a join record keyed by user id.

	Both source files are fed through the same mapper, so the row type is
	inferred from the number of tab-separated columns:

	* exactly 4 columns -> user row (id, email, language, country);
	  emitted as ``user_id<TAB>-<TAB>country``.
	* any other row with at least 3 columns -> transaction row
	  (transaction id, product id, user id, ...);
	  emitted as ``user_id<TAB>product_id<TAB>-``.

	Returns the record as a string without a trailing newline, or ``None``
	for blank or truncated lines that carry neither a user id nor a
	product id. Previously such lines raised IndexError.
	"""
	fields = line.strip().split("\t")

	if len(fields) == 4:
		# User row: the product is unknown at this point, hence the "-" placeholder.
		return fields[0] + '\t' + "-" + '\t' + fields[3]

	if len(fields) < 3:
		# Blank line (e.g. trailing newline in the input file) or truncated row.
		return None

	# Transactions have more columns than users; location is filled in by the reducer.
	return fields[2] + '\t' + fields[1] + '\t' + "-"


if __name__ == "__main__":
	for line in sys.stdin:
		record = map_line(line)
		if record is not None:
			print(record)


Writing joinMapperTU.py


In [10]:
## Se le da permisos de ejecución

In [11]:
!chmod +x joinMapperTU.py

In [12]:
## Se valida la ejecución y resultado

In [13]:
!cat ./input/*.txt | ./joinMapperTU.py 

1	1	-
2	1	-
2	1	-
3	2	-
3	1	-
1	-	US
2	-	GB
3	-	FR


In [14]:
## Nótese que no es posible agrupar los registros porque los datos no están ordenados; al ordenarlos sucede...

In [15]:
!cat ./input/*.txt | ./joinMapperTU.py | sort

1	-	US
1	1	-
2	-	GB
2	1	-
2	1	-
3	-	FR
3	1	-
3	2	-


In [16]:
## Paso 2
##   Se crea el reducer que permite agrupar la información

In [17]:
%%writefile joinReducerTU.py
#!/usr/bin/env python3
import sys
import string

def join_records(lines):
	"""Join mapper records by user id and yield ``product_id<TAB>location``.

	``lines`` is an iterable of ``user_id<TAB>product_id<TAB>location``
	records in which all records of the same user are adjacent (i.e. the
	mapper output after ``sort``). A record whose product id is ``-`` is
	the user row and supplies the location for that user; every other
	record is a transaction.

	The transactions of one user are buffered until the group ends, so the
	result does not depend on the user row sorting before the transactions.
	The previous implementation always consumed the first record of a group
	as the user row, silently dropping a transaction whenever the user row
	was missing or not first. Transactions of a user with no user row are
	emitted with the ``-`` placeholder as location.

	Blank lines and lines that do not have exactly three fields are skipped.
	"""
	current_user = None
	current_location = "-"
	pending_products = []

	for line in lines:
		fields = line.strip().split("\t")
		if len(fields) != 3:
			# Blank or malformed record: nothing to join.
			continue
		user_id, product_id, location = fields

		if user_id != current_user:
			# A new user group starts: flush the previous one.
			for product in pending_products:
				yield product + '\t' + current_location
			current_user = user_id
			current_location = "-"
			pending_products = []

		if product_id == "-":
			# User row: carries the location, no product.
			current_location = location
		else:
			pending_products.append(product_id)

	# Flush the last group.
	for product in pending_products:
		yield product + '\t' + current_location


if __name__ == "__main__":
	for record in join_records(sys.stdin):
		print(record)

Writing joinReducerTU.py


In [18]:
!chmod +x joinReducerTU.py	

In [19]:
## Analizamos el resultado

In [20]:
cat ./input/*.txt | ./joinMapperTU.py | sort | ./joinReducerTU.py | sort

1	FR
1	GB
1	GB
1	US
2	FR


In [21]:
## Limpiamos la maquina local

In [22]:
!rm joinMapperTU.py joinReducerTU.py

In [23]:
## Problema
##   Se desea conocer el total de productos adquiridos por país de ubicación del cliente
##
##
##
##
##
##
##

In [24]:
%%writefile joinMapperTU.py
#!/usr/bin/env python3
import sys
def map_line(line):
	"""Convert one raw input line into a join record keyed by user id.

	Both source files are fed through the same mapper, so the row type is
	inferred from the number of tab-separated columns:

	* exactly 4 columns -> user row (id, email, language, country);
	  emitted as ``user_id<TAB>-<TAB>country``.
	* any other row with at least 3 columns -> transaction row
	  (transaction id, product id, user id, ...);
	  emitted as ``user_id<TAB>product_id<TAB>-``.

	Returns the record as a string without a trailing newline, or ``None``
	for blank or truncated lines that carry neither a user id nor a
	product id. Previously such lines raised IndexError.
	"""
	fields = line.strip().split("\t")

	if len(fields) == 4:
		# User row: the product is unknown at this point, hence the "-" placeholder.
		return fields[0] + '\t' + "-" + '\t' + fields[3]

	if len(fields) < 3:
		# Blank line (e.g. trailing newline in the input file) or truncated row.
		return None

	# Transactions have more columns than users; location is filled in later in the pipeline.
	return fields[2] + '\t' + fields[1] + '\t' + "-"


if __name__ == "__main__":
	for line in sys.stdin:
		record = map_line(line)
		if record is not None:
			print(record)

Writing joinMapperTU.py


In [25]:
!chmod +x joinMapperTU.py

In [26]:
!cat ./input/*.txt | ./joinMapperTU.py | sort

1	-	US
1	1	-
2	-	GB
2	1	-
2	1	-
3	-	FR
3	1	-
3	2	-


In [27]:
!cat ./input/*.txt | ./joinMapperTU.py | sort -r

3	2	-
3	1	-
3	-	FR
2	1	-
2	1	-
2	-	GB
1	1	-
1	-	US


In [28]:
%%writefile joinSuffleTU.py
#!/usr/bin/env python3
import sys
import string

def join_records(lines):
	"""Join mapper records by user id and yield ``product_id<TAB>location``.

	``lines`` is an iterable of ``user_id<TAB>product_id<TAB>location``
	records in which all records of the same user are adjacent (i.e. the
	mapper output after ``sort``). A record whose product id is ``-`` is
	the user row and supplies the location for that user; every other
	record is a transaction.

	The transactions of one user are buffered until the group ends, so the
	result does not depend on the user row sorting before the transactions.
	The previous implementation always consumed the first record of a group
	as the user row, silently dropping a transaction whenever the user row
	was missing or not first. Transactions of a user with no user row are
	emitted with the ``-`` placeholder as location.

	Blank lines and lines that do not have exactly three fields are skipped.
	"""
	current_user = None
	current_location = "-"
	pending_products = []

	for line in lines:
		fields = line.strip().split("\t")
		if len(fields) != 3:
			# Blank or malformed record: nothing to join.
			continue
		user_id, product_id, location = fields

		if user_id != current_user:
			# A new user group starts: flush the previous one.
			for product in pending_products:
				yield product + '\t' + current_location
			current_user = user_id
			current_location = "-"
			pending_products = []

		if product_id == "-":
			# User row: carries the location, no product.
			current_location = location
		else:
			pending_products.append(product_id)

	# Flush the last group.
	for product in pending_products:
		yield product + '\t' + current_location


if __name__ == "__main__":
	for record in join_records(sys.stdin):
		print(record)

Writing joinSuffleTU.py


In [29]:
!chmod +x joinSuffleTU.py

In [30]:
cat ./input/*.txt | ./joinMapperTU.py | sort | ./joinSuffleTU.py | sort

1	FR
1	GB
1	GB
1	US
2	FR


In [31]:
cat ./input/*.txt | ./joinMapperTU.py | sort | ./joinSuffleTU.py | sort -k 2

1	FR
2	FR
1	GB
1	GB
1	US


In [32]:
%%writefile joinReduceTU.py
#!/usr/bin/env python3
import sys
import string

def count_by_location(lines):
	"""Count how many ``product_id<TAB>location`` records exist per location.

	Returns a dict mapping location -> number of records, with keys in the
	order each location was first seen (dict insertion order, Python 3.7+).
	For input already sorted by location this is the same order the
	previous streaming implementation printed its totals in.

	Differences from the previous implementation:
	* empty input yields no totals instead of a spurious ``0<TAB>`` line;
	* blank or malformed lines (not exactly two fields) are skipped instead
	  of raising ValueError;
	* records of one location no longer need to be adjacent, so an input
	  sorted by product id instead of location is still counted correctly.
	"""
	totals = {}
	for line in lines:
		fields = line.strip().split("\t")
		if len(fields) != 2:
			continue
		# Only the location matters here; the product id is just being counted.
		location = fields[1]
		totals[location] = totals.get(location, 0) + 1
	return totals


if __name__ == "__main__":
	for location, total in count_by_location(sys.stdin).items():
		print(str(total) + '\t' + location)

Writing joinReduceTU.py


In [33]:
!chmod +x joinReduceTU.py

In [34]:
cat ./input/*.txt | ./joinMapperTU.py | sort | ./joinSuffleTU.py | sort | joinReduceTU.py

/bin/sh: 1: joinReduceTU.py: not found
sort: fflush failed: 'standard output': Broken pipe
sort: write error


In [35]:
cat ./input/*.txt | ./joinMapperTU.py | sort | ./joinSuffleTU.py | sort -k 2 | ./joinReduceTU.py

2	FR
2	GB
1	US


In [36]:
## Limpieza

In [37]:
!rm joinMapperTU.py joinSuffleTU.py joinReduceTU.py