##  $ NOMBRE: ANGIE\ ESCOBEDO\ MESCCO $



## PREPROCESAMIENTOS IMPLEMENTACION

In [1]:
!pip install pyspark==3.0.1 py4j==0.10.9

Collecting pyspark==3.0.1
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 32 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 62.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612246 sha256=23a79e96aaf1f691ded51655c60d18da4f823c781b29f6ae11f3235240becfe4
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
from pyspark import SparkContext
sc = SparkContext()

## 1. *ESCALONAMIENTO*

In [4]:
def Escalonamiento(x):
  """Escalona los elementos de un RDD

  Args:
    X(RDD NUMERIC: array RDD numeric)
  Returns
    RDD: Vector escalado
  """
  # Obtenemos el minimo y mayor valor del RDD
  Xmin = x.min()
  Xmaxi = x.max()
  dato = x.map(lambda x: (x-Xmin)/(Xmaxi-Xmin))
  return dato

In [5]:
Vector_A = sc.parallelize(range(15,25),3)
Vector = Escalonamiento(Vector_A)
print(Vector.collect())

[0.0, 0.1111111111111111, 0.2222222222222222, 0.3333333333333333, 0.4444444444444444, 0.5555555555555556, 0.6666666666666666, 0.7777777777777778, 0.8888888888888888, 1.0]


## *2. ESTANDARIZACION*

In [6]:
def Estandarizacion(x):
  """Estandariza los elementos de un RDD

  Args:
    X(RDD NUMERIC: array RDD numeric)
  Returns
    RDD: Vector estandarizado
  """
  # Obtenemos la media y la desviacion estandar de valor del RDD
  X_media = x.mean()
  X_des = x.stdev()
  dato = x.map(lambda x: (x-X_media)/X_des)
  return dato

In [7]:
Vector_B = sc.parallelize(range(15,25),3)
Vector1 = Estandarizacion(Vector_B)
print(Vector1.collect())

[-1.5666989036012806, -1.2185435916898848, -0.8703882797784892, -0.5222329678670935, -0.17407765595569785, 0.17407765595569785, 0.5222329678670935, 0.8703882797784892, 1.2185435916898848, 1.5666989036012806]


## *3. NORMALIZACION*

In [10]:
import math
def Normalizacion(x):
  """Normalizar los elementos de un RDD

  Args:
    X(RDD NUMERIC: array RDD numeric)
  Returns
    RDD: Vector normalizado
  """
  lista_cuadrada = x.map(lambda x: x*x)
  sum_total = lista_cuadrada.sum() #Hallamos la suma
  valor = math.sqrt(sum_total) # Hallamos raiz de la suma
  dato = x.map(lambda x: (x/valor))
  return dato

In [11]:
Vector_C = sc.parallelize(range(15,25),3)
Vector2 = Normalizacion(Vector_C)
print(Vector2.collect())

[0.24065547555594474, 0.25669917392634106, 0.2727428722967374, 0.2887865706671337, 0.30483026903753, 0.3208739674079263, 0.3369176657783226, 0.3529613641487189, 0.36900506251911525, 0.38504876088951157]


## *4. SIMILITUD DE COSENOS*

In [34]:

def Sim_cosenos(x,y):
    """similitud de cosenos los elementos de un RDD

    Args:
      X(RDD NUMERIC: array RDD numeric)
    Returns
      RDD: Vector similitud
    """
    def puntos(x,y):
      #a= x.zip(y).map(lambda x: x[0]*x[1]).reduce(lambda x,y:x+y)
      return x.zip(y).map(lambda x: x[0]*x[1]).reduce(lambda x,y:x+y)
    arriba = puntos(x,y)
    abajo = puntos(x,x)*puntos(y,y)
    return arriba/abajo

In [35]:
x = sc.parallelize(range(0,25),4)
y = sc.parallelize(range(50,75),4)
print(Sim_cosenos(x,y))

4.1696349997904704e-05


## 5. BINARIO

In [36]:
def Binario(corpus):

  words = map(lambda x: (x,1),corpus)
  return list(words)

In [38]:
Corpus = [["balon","messi","ronaldo", "ramos","ramos"],["futbol","messi","ronaldo", "futbol","balon"],["ronaldo","messi","futbol", "cancha","cancha"]]
dato=Corpus[0]
print(Binario(dato))

Vector_D = sc.parallelize(Corpus,3)
Vector3 = Vector_D.map(lambda x: Binario(x))
print()
print(Vector3.collect())

[('balon', 1), ('messi', 1), ('ronaldo', 1), ('ramos', 1), ('ramos', 1)]

[[('balon', 1), ('messi', 1), ('ronaldo', 1), ('ramos', 1), ('ramos', 1)], [('futbol', 1), ('messi', 1), ('ronaldo', 1), ('futbol', 1), ('balon', 1)], [('ronaldo', 1), ('messi', 1), ('futbol', 1), ('cancha', 1), ('cancha', 1)]]
