In [None]:
!pip install pandas-profiling==2.7.1

## Para montar el drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder becomes reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Move into the project folder; the relative paths used later (data/, analysis/) resolve from here.
%cd /content/drive/MyDrive/AnalisisDeDatos/PracticaFinal/

/content/drive/MyDrive/AnalisisDeDatos/PracticaFinal


In [None]:
import pandas as pd
import json, ast
import IPython
from pandas_profiling import ProfileReport

## Agregado de atributos relevantes

#### EEUU

Carga de los dataframes

In [None]:
# Load the categorized US businesses plus the full business table, which is the
# source of the raw `attributes` column merged in further down.
business_eeuu_categorized = pd.read_csv(
    'data/business/segmentation/eeuu/business_filtered_eeuu_categorized.csv'
)
business = pd.read_csv('data/business/business_data.csv')

FileNotFoundError: ignored

In [None]:
# Inspect the freshly loaded US dataframe.
business_eeuu_categorized

Añadimos la columna de atributos al dataframe original

In [None]:
# Keep only the join key and the raw attributes, then attach them to the US
# dataframe by business_id (pandas' default inner join).
business_aux = business.loc[:, ['business_id', 'attributes']]
business_eeuu_categorized = business_eeuu_categorized.merge(business_aux, on="business_id")

In [None]:
# Inspect the US dataframe now that it carries the `attributes` column.
business_eeuu_categorized

Sustituimos los NaN por `"{}"`, convertimos la columna `attributes` de String a diccionario y, a su vez, expandimos cada una de las claves de esos diccionarios en columnas.

In [None]:
# Parse the stringified attribute dicts. Only the `attributes` column receives
# the "{}" placeholder: filling the whole frame would also write "{}" into the
# missing values of every unrelated column.
attributes_parsed = (
    business_eeuu_categorized["attributes"]
    .fillna("{}")
    .apply(ast.literal_eval)
)
business_eeuu_categorized["attributes"] = attributes_parsed

# Expand every attribute key into its own column. The dicts are passed as a
# plain list and the frame's index is re-attached afterwards, so the index join
# below aligns row by row regardless of what the index looks like.
attributes_normalized = pd.json_normalize(attributes_parsed.tolist())
attributes_normalized.index = business_eeuu_categorized.index
business_eeuu_categorized = business_eeuu_categorized.join(attributes_normalized)

In [None]:
# Inspect the US dataframe with the expanded attribute columns.
business_eeuu_categorized

Ya no es necesaria la columna `attributes`

In [None]:
# Remove the raw `attributes` column from the US dataframe.
business_eeuu_categorized = business_eeuu_categorized.drop('attributes', axis=1)

Sustituimos los NaN de las nuevas columnas por el String 'None'.

In [None]:
# Replace every remaining NaN with the literal string 'None', then show the result.
business_eeuu_categorized = business_eeuu_categorized.fillna('None')
business_eeuu_categorized

Aplicamos Profiling para comprobar que se hayan eliminado los NaN. Además, nos sirve para realizar un estudio sobre cuáles de las nuevas columnas serán útiles para nuestro proyecto

In [None]:
# Build the profiling report for the US dataframe and write it to an HTML file.
prof = ProfileReport(business_eeuu_categorized)
prof.to_file(output_file='analysis/business_eeuu_categorized_test.html')

In [None]:
# Render the saved US profiling report inline in the notebook.
IPython.display.HTML(filename='analysis/business_eeuu_categorized_test.html') 

Eliminamos las columnas que no necesitamos

In [None]:
# Attribute columns that are not needed for the rest of the project.
eeuu_unused_columns = [
    'GoodForMeal',
    'Alcohol',
    'RestaurantsTakeOut',
    'BusinessAcceptsCreditCards',
    'Ambience',
    'BusinessParking',
    'RestaurantsTableService',
    'BikeParking',
    'RestaurantsAttire',
    'BYOBCorkage',
    'DriveThru',
    'HappyHour',
    'BusinessAcceptsBitcoin',
    'CoatCheck',
    'Smoking',
    'Music',
    'BestNights',
    'WheelchairAccessible',
    'GoodForDancing',
    'DogsAllowed',
    'Corkage',
    'BYOB',
    'ByAppointmentOnly',
    'AgesAllowed',
    'DietaryRestrictions',
    'HairSpecializesIn',
    'Open24Hours',
    'AcceptsInsurance',
    'RestaurantsCounterService',
]
business_eeuu_categorized = business_eeuu_categorized.drop(columns=eeuu_unused_columns)

In [None]:
# Inspect the US dataframe after dropping the unneeded attribute columns.
business_eeuu_categorized

Algunos valores de las columnas se han convertido incorrectamente en formato Unicode. Simplemente eliminamos las comillas y el carácter 'u' y ya tenemos el dataframe listo.

In [None]:
# Some values arrive as the text of a unicode literal, e.g. u'average'.
# Unwrap only that exact shape -- an optional leading `u` followed by a value
# enclosed in single quotes -- instead of stripping characters from both ends,
# which would also eat a legitimate leading/trailing "u" or quote.
unicode_literal_pattern = r"^u?'(.*)'$"
business_eeuu_categorized['NoiseLevel'] = business_eeuu_categorized['NoiseLevel'].str.replace(
    unicode_literal_pattern, r"\1", regex=True
)
business_eeuu_categorized['WiFi'] = business_eeuu_categorized['WiFi'].str.replace(
    unicode_literal_pattern, r"\1", regex=True
)

In [None]:
# Inspect the US dataframe after cleaning the NoiseLevel and WiFi values.
business_eeuu_categorized

In [None]:
# Persist the enriched US dataframe; index=False keeps the row index out of the CSV.
business_eeuu_categorized.to_csv('data/business/segmentation/eeuu/with_attributes/business_filtered_eeuu_categorized.csv', index=False)

In [None]:
# Read the exported file back and display it to check the round trip.
business_eeuu_categorized = pd.read_csv(
    'data/business/segmentation/eeuu/with_attributes/business_filtered_eeuu_categorized.csv'
)
business_eeuu_categorized

Unnamed: 0,business_id,city,num_reviews,open,rating,zipcode,GoodForKids,NoiseLevel,RestaurantsDelivery,Caters,WiFi,RestaurantsGoodForGroups,OutdoorSeating,HasTV,RestaurantsReservations,RestaurantsPriceRange2
0,gnKjwL_1w79qoiV3IC_xQQ,Charlotte,170.0,1.0,4.0,28210.0,True,average,False,False,no,True,False,True,True,2
1,1Dfx3zM-rW4n-31KeC8sJg,Phoenix,18.0,1.0,3.0,85016.0,True,,False,,no,True,False,False,False,1
2,fweCYi8FmbJXHCqLnwuk8w,Mentor-on-the-Lake,16.0,1.0,4.0,44060.0,True,,True,,,True,False,,False,2
3,PZ-LZzSlhSe9utkQYU8pFg,Las Vegas,40.0,0.0,4.0,89119.0,True,quiet,False,True,no,True,False,False,True,2
4,1RHY4K3BD22FK7Cfftn8Mg,Pittsburgh,35.0,1.0,4.0,15231.0,True,,False,,,True,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35821,cfrN6-lQC-dzjBtNBjefpQ,Kent,3.0,0.0,2.5,44240.0,True,,,,,,,,,2
35822,JsRt9LPgv-7guVcY4u6OQA,Huntersville,142.0,1.0,4.5,28078.0,True,average,False,True,no,True,True,False,False,2
35823,7wZgquJ30qkVQbvbJo92ow,Madison,6.0,1.0,3.5,53717.0,,,,,,,,,,
35824,ghovD5ZTGDQ5Q2U4ERddWw,Fairlawn,22.0,1.0,4.0,44333.0,True,loud,False,False,no,True,False,False,False,2


#### CANADA

Carga de los dataframes

In [None]:
# Load the categorized Canadian businesses plus the full business table, which
# is the source of the raw `attributes` column merged in further down.
business_canada_categorized = pd.read_csv(
    'data/business/segmentation/canada/business_filtered_canada_categorized.csv'
)
business = pd.read_csv('data/business/business_data.csv')

In [None]:
# Inspect the freshly loaded Canadian dataframe.
business_canada_categorized

Añadimos la columna de atributos al dataframe original

In [None]:
# Keep only the join key and the raw attributes, then attach them to the
# Canadian dataframe by business_id (pandas' default inner join).
business_aux = business.loc[:, ['business_id', 'attributes']]
business_canada_categorized = business_canada_categorized.merge(business_aux, on="business_id")

In [None]:
# Inspect the Canadian dataframe now that it carries the `attributes` column.
business_canada_categorized

Sustituimos los NaN por `"{}"`, convertimos la columna `attributes` de String a diccionario y, a su vez, expandimos cada una de las claves de esos diccionarios en columnas.

In [None]:
# Parse the stringified attribute dicts. Only the `attributes` column receives
# the "{}" placeholder: filling the whole frame would also write "{}" into the
# missing values of every unrelated column.
attributes_parsed = (
    business_canada_categorized["attributes"]
    .fillna("{}")
    .apply(ast.literal_eval)
)
business_canada_categorized["attributes"] = attributes_parsed

# Expand every attribute key into its own column. The dicts are passed as a
# plain list and the frame's index is re-attached afterwards, so the index join
# below aligns row by row regardless of what the index looks like.
attributes_normalized = pd.json_normalize(attributes_parsed.tolist())
attributes_normalized.index = business_canada_categorized.index
business_canada_categorized = business_canada_categorized.join(attributes_normalized)

In [None]:
# Inspect the Canadian dataframe with the expanded attribute columns.
business_canada_categorized

Ya no es necesaria la columna `attributes`

In [None]:
# Remove the raw `attributes` column from the Canadian dataframe.
business_canada_categorized = business_canada_categorized.drop('attributes', axis=1)

Sustituimos los NaN de las nuevas columnas por el String 'None'.

In [None]:
# Replace every remaining NaN with the literal string 'None', then show the result.
business_canada_categorized = business_canada_categorized.fillna('None')
business_canada_categorized

Aplicamos Profiling para comprobar que se hayan eliminado los NaN. Además, nos sirve para realizar un estudio sobre cuáles de las nuevas columnas serán útiles para nuestro proyecto

In [None]:
prof = ProfileReport(business_eeuu_categorized)
prof.to_file(output_file='analysis/business_eeuu_categorized_test.html')

In [None]:
IPython.display.HTML(filename='analysis/business_eeuu_categorized_test.html') 

Eliminamos las columnas que no necesitamos

In [None]:
# Attribute columns that are not needed for the rest of the project.
canada_unused_columns = [
    'GoodForMeal',
    'Alcohol',
    'RestaurantsTakeOut',
    'BusinessAcceptsCreditCards',
    'Ambience',
    'BusinessParking',
    'RestaurantsTableService',
    'BikeParking',
    'RestaurantsAttire',
    'BYOBCorkage',
    'DriveThru',
    'HappyHour',
    'BusinessAcceptsBitcoin',
    'CoatCheck',
    'Smoking',
    'Music',
    'BestNights',
    'WheelchairAccessible',
    'GoodForDancing',
    'DogsAllowed',
    'ByAppointmentOnly',
    'AgesAllowed',
    'DietaryRestrictions',
    'HairSpecializesIn',
    'AcceptsInsurance',
    'RestaurantsCounterService',
]
business_canada_categorized = business_canada_categorized.drop(columns=canada_unused_columns)

In [None]:
# Inspect the Canadian dataframe after dropping the unneeded attribute columns.
business_canada_categorized

Algunos valores de las columnas se han convertido incorrectamente en formato Unicode. Simplemente eliminamos las comillas y el carácter 'u' y ya tenemos el dataframe listo.

In [None]:
# Some values arrive as the text of a unicode literal, e.g. u'average'.
# Unwrap only that exact shape -- an optional leading `u` followed by a value
# enclosed in single quotes -- instead of stripping characters from both ends,
# which would also eat a legitimate leading/trailing "u" or quote.
unicode_literal_pattern = r"^u?'(.*)'$"
business_canada_categorized['NoiseLevel'] = business_canada_categorized['NoiseLevel'].str.replace(
    unicode_literal_pattern, r"\1", regex=True
)
business_canada_categorized['WiFi'] = business_canada_categorized['WiFi'].str.replace(
    unicode_literal_pattern, r"\1", regex=True
)

In [None]:
# Inspect the Canadian dataframe after cleaning the NoiseLevel and WiFi values.
business_canada_categorized

In [None]:
# Persist the enriched Canadian dataframe; index=False keeps the row index out of the CSV.
business_canada_categorized.to_csv('data/business/segmentation/canada/with_attributes/business_filtered_canada_categorized.csv', index=False)