## Introducción

En este notebook se realizarán las labores de machine learning para la generación de modelos. Para ello, se hará primero un procesado de los zipcodes y una limpieza de variables, para finalmente generar los modelos utilizando diversos algoritmos.

## Para montar el drive

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: switch the working directory to the project folder, so the
# relative 'data/...' paths used in the rest of the notebook resolve from here.
%cd /content/drive/MyDrive/AnalisisDeDatos/PracticaFinal/

Mounted at /content/drive
/content/drive/MyDrive/AnalisisDeDatos/PracticaFinal


## Dependencias y utilidades

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn_pandas import CategoricalImputer
from pandas_profiling import ProfileReport
import IPython
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import itertools
from sklearn.model_selection import KFold
from sklearn import metrics

## Procesado de Zipcodes

In [None]:
# Restaurants table (file name suggests it is already filtered and categorised).
# The comma separator is pandas' default, so it is not passed explicitly.
restaurants = pd.read_csv(
  'data/business/segmentation/eeuu/with_attributes/business_filtered_eeuu_categorized.csv'
)
restaurants

# General business table; the next cell takes lat/long from it.
business_general = pd.read_csv('data/business/business_data.csv')


Una vez se ha cargado nuestro dataframe con los restaurantes y el dataframe original de negocios, se procede a realizar un merge añadiendo la latitud y la longitud de los restaurantes.

In [None]:
# Attach latitude/longitude to every restaurant. Inner join on business_id
# (pandas' default): restaurants without a match in business_general are dropped.
restaurants = restaurants.merge(
  business_general.loc[:, ["business_id", "lat", "long"]],
  how="inner",
  on="business_id",
)
restaurants

In [None]:
import requests
def find_zipcode_by_coords(row):
  """Return the postcode Geoapify reports for the coordinates stored in ``row``.

  Args:
    row: Object exposing ``lat`` and ``long`` attributes (a DataFrame row when
      used through ``DataFrame.apply(..., axis='columns')``).

  Returns:
    The ``postcode`` field of the first reverse-geocoding result, or ``None``
    when the response carries no results or the first result has no postcode.

  Raises:
    requests.exceptions.RequestException: If the HTTP request fails or does
      not complete within the timeout.
  """
  # NOTE(review): the API key is committed in the notebook; consider loading it
  # from an environment variable / Colab secret and rotating this one.
  r = requests.get(
    "https://api.geoapify.com/v1/geocode/reverse",
    params={
      "lat": row.lat,
      "lon": row.long,
      "format": "json",
      "apiKey": "1b48259b810e48ddb151889f9ea58db0",
    },
    # Without a timeout a single stalled connection blocks the whole apply().
    timeout=30,
  )
  response = r.json()

  # 'results' may be missing (error payload) or an empty list (no match);
  # indexing [0] on an empty list would raise IndexError.
  results = response['results'] if 'results' in response else None
  if not results:
    return None
  return results[0].get('postcode')

# Build a copy of the restaurants table with one extra column, 'zipcode',
# obtained by reverse-geocoding each row (one HTTP request per restaurant).
b = restaurants.assign(zipcode=restaurants.apply(find_zipcode_by_coords, axis=1))

# Persist the result so the geocoding does not have to be repeated.
b.to_csv('data/business/segmentation/eeuu/restaurants_zipcodes.csv', index=False)
b

El código de arriba hace uso de la API proporcionada por Geoapify que permite obtener la información de los zipcodes de los restaurantes a partir de la longitud y latitud de los mismos.

In [None]:
# Reload the geocoded table. The 'zipcode' column is read as text: the
# validation in the next cell measures its length, and a numeric parse would
# also drop leading zeros.
b = pd.read_csv('data/business/segmentation/eeuu/restaurants_zipcodes.csv', sep=",", dtype={'zipcode': str})

In [None]:
def _to_us_zip(value):
  """Return ``value`` as an int when it is a 5-digit ZIP code, else ``None``."""
  # Missing postcodes come back from the CSV as NaN, which is truthy and has no len().
  if pd.isna(value):
    return None
  text = str(value).strip()
  # A column parsed as float renders 85001 as '85001.0'.
  if text.endswith('.0'):
    text = text[:-2]
  if len(text) == 5 and text.isdecimal():
    return int(text)
  return None


# Rows of `b` are matched to rows of `restaurants` by position, as both tables
# hold the same restaurants in the same order.
zipcodes = [_to_us_zip(value) for value in b['zipcode'].iloc[:len(restaurants)]]

# Nullable integer dtype: valid ZIP codes stay integers, invalid ones become <NA>.
restaurants['zipcode'] = pd.array(zipcodes, dtype='Int64')

restaurants.to_csv('data/business/segmentation/eeuu/restaurants_zipcodes_CORREGIDOS.csv', index=False)
restaurants

Por último, realizamos un cast de los zipcodes a tipo entero.

# Aprendizaje Automático

In [None]:
# Read the restaurants dataframe with the validated ZIP codes.
# NOTE(review): this path contains a 'zipcodes/' folder, whereas the cell that
# writes restaurants_zipcodes_CORREGIDOS.csv saves it directly under
# '.../eeuu/' — confirm the file was moved there.
restaurants = pd.read_csv(
  'data/business/segmentation/eeuu/zipcodes/restaurants_zipcodes_CORREGIDOS.csv'
)
restaurants

NameError: ignored

### Limpieza y discretización

En las secciones de código posteriores se definen las funciones que realizarán las distintas discretizaciones de los datos, así como una limpieza de los mismos, eliminando las variables innecesarias.

In [None]:

#Tres clases
#Discretización de rating
#def discretize(row):
 #  if row.rating < 3:
  #   row.rating = "Bajo"
  # elif row.rating >= 3 and row.rating <4:
   #  row.rating = "Medio"
  # elif row.rating >= 4:
   #  row.rating = "Alto"
  # return row.rating

#Dos clases
def discretize(row):
  """Map a numeric rating to the two-class label used as model target.

  The row is only read, never modified.

  Args:
    row: Object exposing a numeric ``rating`` attribute.

  Returns:
    ``"Bajo"`` when ``rating <= 3.5`` and ``"Alto"`` when ``rating > 3.5``.
    A rating that satisfies neither comparison (e.g. NaN) is returned as is.
  """
  rating = row.rating
  if rating <= 3.5:
    return "Bajo"
  if rating > 3.5:
    return "Alto"
  # Neither comparison holds (NaN): pass the value through untouched.
  return rating

def discretize_rev(row):
  """Map a review count to one of four volume labels.

  The row is only read, never modified.

  Args:
    row: Object exposing a numeric ``num_reviews`` attribute.

  Returns:
    ``"Bajas"`` (<= 10), ``"Medias"`` (11-50), ``"Altas"`` (51-200) or
    ``"Muy Altas"`` (> 200). A count that satisfies none of the comparisons
    (e.g. NaN) is returned as is.
  """
  count = row.num_reviews
  # The checks run in ascending order, so each one only needs its upper bound.
  if count <= 10:
    return "Bajas"
  if count <= 50:
    return "Medias"
  if count <= 200:
    return "Altas"
  if count > 200:
    return "Muy Altas"
  # No comparison holds (NaN): pass the value through untouched.
  return count



In [None]:
# Replace the numeric rating and review count with their categorical labels.
# axis=1 hands each row to the function.
restaurants['rating'] = restaurants.apply(func=discretize, axis=1)
restaurants['num_reviews'] = restaurants.apply(func=discretize_rev, axis=1)
restaurants

In [None]:
# Remove the columns that the modelling cells further down do not use.
restaurants = restaurants.drop(
  ['business_id', 'categories', 'city', 'open', 'lat', 'long', 'Caters'],
  axis='columns',
)

In [None]:
# Turn the literal string "None" into a real missing value.
# np.nan is used instead of None: on pandas < 1.4 an explicit value=None is
# ignored and replace() forward-fills from the previous row instead.
restaurants = restaurants.replace("None", np.nan)

In [None]:
# Save the cleaned, discretised table (row index not written).
restaurants.to_csv(
  path_or_buf='data/business/segmentation/eeuu/AA/restaurants.csv',
  index=False,
)

### Separación por zonas

In [None]:
# Reload the cleaned table saved by the previous section (comma-separated, the pandas default).
restaurants = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants.csv')
restaurants

In [None]:
# One frame per state, selected by the state's ZIP-code interval (both bounds
# inclusive). .copy() makes each frame independent of `restaurants`, so the
# 'zone' column added in the following cells is written to the frame itself
# rather than to a slice (no SettingWithCopyWarning / lost assignments).
restaurants_AZ = restaurants[restaurants.zipcode.between(85001, 86556)].copy()
restaurants_NV = restaurants[restaurants.zipcode.between(88901, 89883)].copy()
restaurants_WI = restaurants[restaurants.zipcode.between(53001, 54990)].copy()
restaurants_IL = restaurants[restaurants.zipcode.between(60001, 62999)].copy()
restaurants_NC = restaurants[restaurants.zipcode.between(27006, 28909)].copy()
restaurants_PA = restaurants[restaurants.zipcode.between(15001, 19640)].copy()
restaurants_OH = restaurants[restaurants.zipcode.between(43001, 45999)].copy()

Finalmente, vamos a añadir una nueva columna 'zone' dentro de cada dataframe, que indica la zona del estado a la que pertenece cada restaurante según su zipcode.

In [None]:
# Add the column; ZIP codes not matched by any rule below keep None.
restaurants_AZ['zone'] = None

# Fill it with the corresponding zone.
az_zip = restaurants_AZ['zipcode']
# ZIP codes inside the 85300-85400 block that are assigned to 'Centro' instead of 'Oeste'.
az_center_exceptions = az_zip.isin([85331, 85377, 85327])

restaurants_AZ.loc[az_zip.between(85300, 85400) & ~az_center_exceptions, 'zone'] = 'Oeste'
restaurants_AZ.loc[az_zip.between(85001, 85100) | az_center_exceptions | (az_zip == 86260), 'zone'] = 'Centro'
restaurants_AZ.loc[az_zip.between(85101, 85299) | (az_zip == 85412), 'zone'] = 'Este'

restaurants_AZ.to_csv('data/business/segmentation/eeuu/AA/restaurants_AZ.csv', index=False)

In [None]:
# Same procedure for Nevada. ZIP codes not listed below keep zone None.
restaurants_NV['zone'] = None

# ZIP codes per zone.
# NOTE(review): 89124 is listed under both north and south; it ends up as
# 'Sur' because the last assignment wins.
zip_north_NV = [89134,89128,89129,89124,89166,89143,89131,89130, 89108,89196,89032,89030,89031,89084,89085,89086,89087,89081,89115,89156,89149,89191,89136]
zip_center_NV = [89138, 89144, 89145, 89107, 89106, 89101, 89110, 89135, 89117, 89147, 89146, 89103, 89102, 89109, 89169, 89121, 89104, 89142, 89122, 89158,89154,89152]
zip_south_NV = [89161,89004,89124,89054,89148,89113,89118,89119,89178,89179,89139,89141,89123,89183,89044,89052,89120,89011,89074,89012,89015,89002,89005,89124,89193,89014,89105,89016,89159,89111,89165,89199]

# Build the masks from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so they never depend on index alignment.
nv_zip = restaurants_NV['zipcode']
restaurants_NV.loc[nv_zip.isin(zip_north_NV),'zone'] = 'Norte'
restaurants_NV.loc[nv_zip.isin(zip_center_NV),'zone'] = 'Centro'
restaurants_NV.loc[nv_zip.isin(zip_south_NV),'zone'] = 'Sur'

restaurants_NV.to_csv('data/business/segmentation/eeuu/AA/restaurants_NV.csv', index=False)

In [None]:
# Same procedure for Wisconsin. ZIP codes not listed below keep zone None.
restaurants_WI['zone'] = None

# ZIP codes per zone.
# NOTE(review): 56703, 56705, 56706, 56713 and 56716 fall outside the
# 53001-54990 interval used to select the Wisconsin rows, so they can never
# match — possibly typos, confirm against the intended list.
zip_west_WI = [53572,53515,53583,53528,53529,53593,53508,53719,53717,54562,53562]
zip_center_WI = [53711,56705,53597,53726,53715,53597,56706,56713,56716,56703,53575,53716,53703,53713,53701,53706,53705,53792,54704,53521,53725]
zip_east_WI = [53708,53704,53714,56716,53718,53527,53598,53590,53559,53783,53589,53558,53531,53532,53571,53596]

# Build the masks from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so they never depend on index alignment.
wi_zip = restaurants_WI['zipcode']
restaurants_WI.loc[wi_zip.isin(zip_west_WI),'zone'] = 'Oeste'
restaurants_WI.loc[wi_zip.isin(zip_center_WI),'zone'] = 'Centro'
restaurants_WI.loc[wi_zip.isin(zip_east_WI),'zone'] = 'Este'

restaurants_WI.to_csv('data/business/segmentation/eeuu/AA/restaurants_WI.csv', index=False)

In [None]:
# Same procedure for North Carolina.
restaurants_NC['zone'] = None

# ZIP codes per zone.
# NOTE(review): 28214 is listed under both north and center; it ends up as
# 'Centro' because the later assignment wins.
zip_north_NC= [28031, 28036, 28078,28216, 28214 , 28269, 28262, 28117, 28115, 28037,28080,28165,28117,28115,28027,28081,28082,28083,28075,28025,28262,28034,28021,28164,28120,28037,28087,28223]
zip_center_NC = [28208,28206,  28213,28215, 28205, 28202, 28204, 28203, 28209, 28207, 28212, 28227, 28211, 28212, 28217,28012,28130,28208,28214,28012,28101,28032,28056,28054,28052,28506,28097,28126,28107,28098,28282,28244,28273]
# South = every ZIP code in 27006-28909 that is in neither list above.

# Build the masks from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so they never depend on index alignment.
nc_zip = restaurants_NC['zipcode']
restaurants_NC.loc[nc_zip.isin(zip_north_NC),'zone'] = 'Norte'
restaurants_NC.loc[nc_zip.isin(zip_center_NC),'zone'] = 'Centro'
restaurants_NC.loc[~nc_zip.isin(zip_north_NC) & ~nc_zip.isin(zip_center_NC) & nc_zip.between(27006, 28909),'zone'] = 'Sur'

restaurants_NC.to_csv('data/business/segmentation/eeuu/AA/restaurants_NC.csv', index=False)

In [None]:
# Same procedure for Illinois, which is treated as a single 'Centro' zone.
restaurants_IL['zone'] = None

# ZIP codes per zone. The interval matches the one used to select the Illinois
# rows (60001-62999, both inclusive); range() excludes its upper bound, hence 63000.
zip_center_IL = range(60001, 63000)

# Build the mask from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so it never depends on index alignment.
restaurants_IL.loc[restaurants_IL['zipcode'].isin(zip_center_IL),'zone'] = 'Centro'

restaurants_IL.to_csv('data/business/segmentation/eeuu/AA/restaurants_IL.csv', index=False)

In [None]:
# Same procedure for Pennsylvania.
restaurants_PA['zone'] = None

# ZIP codes per zone.
zip_north_PA = [15003,15056,15108,15143,15225,15237,15090,15015,15101,15044,15076,15229,15202,15209,15223,15116,15238,15024,15076,15084,15049,15030,15116,15139,15147,15144,15068,15215,15046,15091,15042,15007]
zip_center_PA = [15231,15126,15275,15071,15205,15244,15136,15205,15220,15211,15210,15203,15213,15219,15222,15121,15233,15212,15214,15262,15224,15203,15207,15221,15232,15206,15224,15201,15206,15217,15207,15218,15145,15235,15239,15146,15668,15290,15240,15260,15208]
# South = every ZIP code in 15001-19640 that is in neither list above.

# Build the masks from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so they never depend on index alignment.
pa_zip = restaurants_PA['zipcode']
restaurants_PA.loc[pa_zip.isin(zip_north_PA),'zone'] = 'Norte'
restaurants_PA.loc[pa_zip.isin(zip_center_PA),'zone'] = 'Centro'
restaurants_PA.loc[~pa_zip.isin(zip_north_PA) & ~pa_zip.isin(zip_center_PA) & pa_zip.between(15001, 19640),'zone'] = 'Sur'

restaurants_PA.to_csv('data/business/segmentation/eeuu/AA/restaurants_PA.csv', index=False)

In [None]:
# Same procedure for Ohio. ZIP codes not listed below keep zone None.
restaurants_OH['zone'] = None

# ZIP codes per zone.
zip_west_OH = [44140,44145, 44070, 44138, 44116, 44126, 44135,44133 ,44142, 44017, 44130, 44149, 44136, 44107, 44111,44060,44237,44053,44035,44280,44001,44256,44039,44012,44050,44011,44052,44028,44054,44044,44055,44253]
zip_center_OH = [44102, 44144, 44129, 44147, 44141, 44113, 44109, 44134, 44131, 44114, 44103, 44108, 44115, 44104, 44106, 44127, 44105, 44120, 44128, 44137, 44146, 44125,44223,44333,44310,44067,44281,44203,44320,44321,44199,44264,44313,44195,44233,44286,44210,44314,44307,44101]
zip_east_OH = [44110, 44119, 44123, 44132, 44117, 44143, 44040, 44112, 44121, 44118, 44124, 44022, 44122, 44139,44212,44240,44224,44094,44021,44024,44241,44095,44092,44056,44026,44221,44062,44023,44255,44202,44087,44278,44242,44045,44236,44077,44234,44099,44262,44326,44065,44046,44266,44081,44491]

# Build the masks from the state frame itself (as the Arizona cell does), not
# from the parent `restaurants` frame, so they never depend on index alignment.
oh_zip = restaurants_OH['zipcode']
restaurants_OH.loc[oh_zip.isin(zip_west_OH),'zone'] = 'Oeste'
restaurants_OH.loc[oh_zip.isin(zip_center_OH),'zone'] = 'Centro'
restaurants_OH.loc[oh_zip.isin(zip_east_OH),'zone'] = 'Este'

restaurants_OH.to_csv('data/business/segmentation/eeuu/AA/restaurants_OH.csv', index=False)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns holding labels that are turned into integer codes.
categorical_cols = [
  'num_reviews', 'rating', 'GoodForKids', 'NoiseLevel', 'RestaurantsDelivery', 'WiFi',
  'RestaurantsGoodForGroups', 'OutdoorSeating', 'HasTV', 'RestaurantsReservations', 'zone',
]
restaurants_states = [
  restaurants_AZ,
  restaurants_NV,
  restaurants_WI,
  restaurants_IL,
  restaurants_NC,
  restaurants_PA,
  restaurants_OH,
]

# Each column is encoded independently within each state frame (the encoder is
# refitted on every column), so the integer codes are not comparable across states.
for r in restaurants_states:
  le = LabelEncoder()
  for col in categorical_cols:
    r[col] = le.fit_transform(r[col])

# Overwrite the per-state files with the encoded versions.
for path, frame in (
  ('data/business/segmentation/eeuu/AA/restaurants_AZ.csv', restaurants_AZ),
  ('data/business/segmentation/eeuu/AA/restaurants_NV.csv', restaurants_NV),
  ('data/business/segmentation/eeuu/AA/restaurants_WI.csv', restaurants_WI),
  ('data/business/segmentation/eeuu/AA/restaurants_IL.csv', restaurants_IL),
  ('data/business/segmentation/eeuu/AA/restaurants_NC.csv', restaurants_NC),
  ('data/business/segmentation/eeuu/AA/restaurants_PA.csv', restaurants_PA),
  ('data/business/segmentation/eeuu/AA/restaurants_OH.csv', restaurants_OH),
):
  frame.to_csv(path, index=False)

### Generación de modelos

In [None]:
# Reload the encoded per-state tables (comma-separated, the pandas default).
restaurants_AZ = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_AZ.csv')
restaurants_NV = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_NV.csv')
restaurants_WI = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_WI.csv')
restaurants_IL = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_IL.csv')
restaurants_NC = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_NC.csv')
restaurants_PA = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_PA.csv')
restaurants_OH = pd.read_csv('data/business/segmentation/eeuu/AA/restaurants_OH.csv')

In [None]:
# The ZIP code has already been summarised into 'zone'; remove it from every state frame.
restaurants_AZ = restaurants_AZ.drop('zipcode', axis='columns')
restaurants_NV = restaurants_NV.drop('zipcode', axis='columns')
restaurants_WI = restaurants_WI.drop('zipcode', axis='columns')
restaurants_IL = restaurants_IL.drop('zipcode', axis='columns')
restaurants_NC = restaurants_NC.drop('zipcode', axis='columns')
restaurants_PA = restaurants_PA.drop('zipcode', axis='columns')
restaurants_OH = restaurants_OH.drop('zipcode', axis='columns')

In [None]:
# State frames in the order the result cells below index them:
# 0=AZ, 1=NV, 2=WI, 3=IL, 4=NC, 5=PA, 6=OH.
restaurants_states = [
  restaurants_AZ,
  restaurants_NV,
  restaurants_WI,
  restaurants_IL,
  restaurants_NC,
  restaurants_PA,
  restaurants_OH,
]


El código siguiente implementa un evaluador automático basado en Cross Validation para diversos algoritmos.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
# One results DataFrame per state, in the same order as restaurants_states.
results_states = []

# Classifiers to evaluate and the hyper-parameter grid explored for each one:
# decision tree, k-NN, random forest, AdaBoost and multilayer perceptron.
# (Gaussian Naive Bayes is left disabled.)
classifiers = {
  'Decision Tree': (DecisionTreeClassifier, {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
  }),
  'K-Nearest Neighbors': (KNeighborsClassifier, {
    'n_neighbors': [5, 10, 15],
    'p': [1, 2],
  }),
  'RandomForestClassifier': (RandomForestClassifier, {
    'criterion': ['gini', 'entropy'],
  }),
  'AdaBoost': (AdaBoostClassifier, {
    'n_estimators':[5,30,50,100],
  }),
  'Multilayer-Perceptron': (MLPClassifier, {
    'learning_rate':['adaptive'],
    'hidden_layer_sizes':[5, 10, (5, 10)],
  }),
  # 'Naive Bayes': (GaussianNB, {
  # })
}

# Predictor columns; 'rating' is the target.
feature_cols = ['num_reviews','GoodForKids','NoiseLevel','RestaurantsDelivery','WiFi',
                'RestaurantsGoodForGroups','OutdoorSeating','RestaurantsReservations',
                'RestaurantsPriceRange2','zone']

for r in restaurants_states:
  X = r[feature_cols]
  y = r['rating']

  # Work on plain arrays so the folds can be indexed positionally.
  Xv, yv = X.values, y.values

  # 5-fold cross-validation; each held-out fold acts as the validation set.
  # The shuffle is seeded so that repeated runs evaluate the same folds.
  cv = KFold(n_splits=5, shuffle=True, random_state=42)

  # One record per (fold, classifier, configuration).
  results = []

  for cv_fold, (train_ix, test_ix) in enumerate(cv.split(Xv, yv)):
    X_train, X_test = Xv[train_ix], Xv[test_ix]
    y_train, y_test = yv[train_ix], yv[test_ix]

    # Balance the training fold by undersampling; the test fold is left as is.
    under_sampler = RandomUnderSampler(random_state=42)
    X_res, y_res = under_sampler.fit_resample(X_train, y_train)
    # Show the class counts of both sets.
    print(f"Training target statistics: {Counter(y_res)}")
    print(f"Testing target statistics: {Counter(y_test)}")

    for clf_name, (Clf, hyperparams) in classifiers.items():
      hp_ks = list(hyperparams)

      # Cartesian product of the grid: one model per configuration.
      for hp_vs in itertools.product(*hyperparams.values()):
        config = dict(zip(hp_ks, hp_vs))
        clf = Clf(**config)
        print(clf)
        clf.fit(X_res, y_res)
        y_pred = clf.predict(X_test)
        results.append({
          'classifier': clf_name,
          'hyperparameters': str(config),
          'cv_fold': cv_fold,
          'accuracy': metrics.accuracy_score(y_test, y_pred),
          # Macro averaging: with average='micro' precision, recall and F1 are
          # all equal to accuracy in single-label classification.
          'precision': metrics.precision_score(y_test, y_pred, average='macro'),
          'recall': metrics.recall_score(y_test, y_pred, average='macro'),
          'f1': metrics.f1_score(y_test, y_pred, average='macro'),
          'model': clf
        })

  # Keep this state's results; they are grouped by classifier and
  # hyper-parameters in the cells below.
  results_df = pd.DataFrame(results)
  results_states.append(results_df)


In [None]:
# Mean of every metric across the folds for the first state (AZ).
# numeric_only=True leaves out the 'model' column, which holds estimator
# objects and makes mean() raise on pandas >= 2.0.
results_states[0].groupby(['classifier', 'hyperparameters']).mean(numeric_only=True)

Guardamos los resultados

In [None]:
# 3 CLASSES
# NOTE(review): this cell and the 2-classes cell save the same results_states;
# presumably only the one matching the active discretize() should be run.
# numeric_only=True leaves out the 'model' column, which holds estimator
# objects and makes mean() raise on pandas >= 2.0.
for position, state in enumerate(['AZ', 'NV', 'WI', 'IL', 'NC', 'PA', 'OH']):
  summary = results_states[position].groupby(['classifier', 'hyperparameters']).mean(numeric_only=True)
  summary.to_csv(f'data/business/segmentation/eeuu/AA/results/3classes/results-{state}.csv')

In [None]:
# 2 CLASSES
# NOTE(review): this cell and the 3-classes cell save the same results_states;
# presumably only the one matching the active discretize() should be run.
# numeric_only=True leaves out the 'model' column, which holds estimator
# objects and makes mean() raise on pandas >= 2.0.
for position, state in enumerate(['AZ', 'NV', 'WI', 'IL', 'NC', 'PA', 'OH']):
  summary = results_states[position].groupby(['classifier', 'hyperparameters']).mean(numeric_only=True)
  summary.to_csv(f'data/business/segmentation/eeuu/AA/results/2classes/results-{state}.csv')

Implementación manual de Decision Tree para ver la importancia de las variables.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Decision tree fitted on the Arizona frame to inspect the weight of each attribute.

# Separate the predictors from the target before splitting, so the feature
# names used below always match the columns the tree was trained on.
features = restaurants_AZ.drop(columns=['rating'])
X_train, X_test, y_train, y_test = train_test_split(features, restaurants_AZ['rating'], test_size = 0.2)

# Balance the training set by undersampling; the test set is left as is.
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(X_train, y_train)
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")

dt = DecisionTreeClassifier()
dt = dt.fit(X_res,y_res)

y_pred = dt.predict(X_test)

acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:",acc)

# And the F1 score:
f1 = metrics.f1_score(y_test, y_pred,average="macro")
conf = confusion_matrix(y_test, y_pred)
print(conf)
print("F1:",f1)

# Feature names in training order (every column except the target), taken
# from the data rather than from a hard-coded column count.
card_col = list(features.columns)

for name, importance in zip(card_col, dt.feature_importances_):
  print('\033[1m' + name + "\033[0m \t\t: " + str(importance))

fig = plt.figure(figsize=(40,35))
# feature_names must list exactly the training columns: passing
# restaurants_AZ.columns would still include 'rating' and mislabel the nodes.
_ = tree.plot_tree(dt, max_depth = 3,feature_names=card_col)