In [127]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import SimpleImputer
import re

from fuzzywuzzy import process, fuzz

In [128]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/raw")

In [129]:
cwd = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir))

In [130]:
os.listdir(path)

['BX-Books.csv',
 'BX-NewBooks.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooksUsers.csv',
 'BX-Ratings.csv',
 'BX-Users.csv']

# Cleaning Bx-Users.csv subset (fuzzy wuzzy too long)

In [131]:
users = pd.read_csv(path + "/BX-Users.csv")
users.columns

Index(['User-ID', 'User-City', 'User-State', 'User-Country', 'User-Age'], dtype='object')

In [132]:
users_sample = users.sample(n=5,random_state = 123)

#### Clean Country names

regex clean values that are supposed to be nan

In [133]:
pattern_1 = r'[xX]{2,6}'  # Matches 2 to 6 occurrences of "X"
pattern_2 = r'\b(n/a)\b'

# Replace matching values with np.nan
for column in ['User-Country', 'User-State', 'User-City']:
    # Replace matching values with np.nan using the respective pattern
    users_sample[column] = users_sample[column].replace(pattern_1, np.nan, regex=True)
    users_sample[column] = users_sample[column].replace(pattern_2, np.nan)

# Fill remaining NaN values with np.nan
users_sample.fillna(np.nan, inplace=True)

Strip apostrophe and spaces

In [134]:
columns = ['User-Country', 'User-State', 'User-City','User-Age']
for column in columns:
    users_sample[column] = users_sample[column].apply(lambda x: x.strip().strip('"') if pd.notnull(x) and isinstance(x, str) else x)
users_sample.head()

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
8782,50984,cardiff,wales,united kingdom,18.0
7664,44037,london,england,united kingdom,26.0
37610,217705,nelson,british columbia,canada,
28320,163548,andújar,jaén,spain,33.0
44040,253780,fredericton,new brunswick,,25.0


Fill in empty states/countries via data base from : 

https://simplemaps.com/data/us-cities

https://simplemaps.com/data/world-cities

In [135]:
cities_path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/cities")

cities = pd.read_csv(cities_path + "/worldcities.csv")
cities = cities[['city_ascii','country', 'admin_name']]
cities = cities[['city_ascii', 'country', 'admin_name']].rename(columns={'admin_name': 'state_name'})
filtered = cities[cities['country'] != 'United States']

print(cities.shape)
cities.head()

(47869, 3)


Unnamed: 0,city_ascii,country,state_name
0,Tokyo,Japan,Tōkyō
1,Jakarta,Indonesia,Jakarta
2,Delhi,India,Delhi
3,Guangzhou,China,Guangdong
4,Mumbai,India,Mahārāshtra


In [136]:
us_cities = pd.read_csv(cities_path + "/uscities.csv")
us_cities = us_cities[['city_ascii','state_name']]
us_cities['country'] = 'usa'
print(us_cities.shape)
us_cities.head()

(31120, 3)


Unnamed: 0,city_ascii,state_name,country
0,New York,New York,usa
1,Los Angeles,California,usa
2,Chicago,Illinois,usa
3,Miami,Florida,usa
4,Houston,Texas,usa


In [137]:
cities = pd.concat([cities,us_cities], axis = 0)
cities = cities[['city_ascii', 'country', 'state_name']].rename(columns={'state_name': 'states'})

print(cities.shape)
cities.head()

(78989, 3)


Unnamed: 0,city_ascii,country,states
0,Tokyo,Japan,Tōkyō
1,Jakarta,Indonesia,Jakarta
2,Delhi,India,Delhi
3,Guangzhou,China,Guangdong
4,Mumbai,India,Mahārāshtra


Case Folding database for fuzzy match

In [138]:
cities['city_ascii'] = cities['city_ascii'].str.lower()
cities['country'] = cities['country'].str.lower()
cities['states'] = cities['states'].str.lower()

Fixing naming deviations

In [139]:
cities['country'] = cities['country'].replace('korea, south', 'south korea', regex=True)
cities['country'] = cities['country'].replace('united states', 'usa', regex=True)

## Imputing corrosponding states and country from cities

We are able to tell what state and country it is from the city, but we can't neccesarily tell the city from the country. <br>
So we will fuzzy match the cities where the row has a city name but not a state or country.<br> We will then find the corrosponding state and city in the cities database and fill those in.<br>



In [140]:
def fuzzy_match(input_string, choices):
    # Use process.extractOne to find the best match
    best_match, score = process.extractOne(input_string, choices)
    return best_match, score

In [141]:
cities_list = cities['city_ascii'].to_list()
cities_list

['tokyo',
 'jakarta',
 'delhi',
 'guangzhou',
 'mumbai',
 'manila',
 'shanghai',
 'sao paulo',
 'seoul',
 'mexico city',
 'cairo',
 'new york',
 'dhaka',
 'beijing',
 'kolkata',
 'bangkok',
 'shenzhen',
 'moscow',
 'buenos aires',
 'lagos',
 'istanbul',
 'karachi',
 'bangalore',
 'ho chi minh city',
 'osaka',
 'chengdu',
 'tehran',
 'kinshasa',
 'rio de janeiro',
 'chennai',
 "xi'an",
 'lahore',
 'chongqing',
 'los angeles',
 'baoding',
 'london',
 'paris',
 'linyi',
 'dongguan',
 'hyderabad',
 'tianjin',
 'lima',
 'wuhan',
 'nanyang',
 'hangzhou',
 'foshan',
 'nagoya',
 'tongshan',
 'luanda',
 'zhoukou',
 'ganzhou',
 'kuala lumpur',
 'heze',
 'quanzhou',
 'johannesburg',
 'chicago',
 'nanjing',
 'jining',
 'hanoi',
 'pune',
 'fuyang',
 'ahmedabad',
 'bogota',
 'shenyang',
 'dar es salaam',
 'khartoum',
 'shangqiu',
 'hong kong',
 'cangzhou',
 'riyadh',
 'santiago',
 'xingtai',
 'zhumadian',
 'chattogram',
 'surabaya',
 'zhanjiang',
 'bijie',
 'yancheng',
 'hengyang',
 'zunyi',
 'shaoy

In [142]:
for index, row in users_sample.iterrows():
    if (not pd.isnull(row['User-City'])) and (pd.isnull(row['User-State']) or pd.isnull(row['User-Country'])):
        city_to_find =fuzzy_match(row['User-City'],cities_list)[0]
        print(city_to_find)
        city_mask = cities[cities['city_ascii'] == city_to_find]
        if not city_mask.empty:
            state = city_mask['states'].iloc[0]
            country = city_mask['country'].iloc[0]
            users_sample.loc[index, 'User-State'] = state
            users_sample.loc[index, 'User-Country'] = country
            
        

fredericton


In [143]:
users_sample

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
8782,50984,cardiff,wales,united kingdom,18.0
7664,44037,london,england,united kingdom,26.0
37610,217705,nelson,british columbia,canada,
28320,163548,andújar,jaén,spain,33.0
44040,253780,fredericton,new brunswick,canada,25.0


### Cleaning Age

In [156]:
users_sample.dtypes
users_sample

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
8782,50984,cardiff,wales,united kingdom,18.0
7664,44037,london,england,united kingdom,26.0
37610,217705,nelson,british columbia,canada,
28320,163548,andújar,jaén,spain,33.0
44040,253780,fredericton,new brunswick,canada,25.0


Imputing Age

In [162]:
valid_age = users_sample.dropna(subset=['User-Age'])
valid_age['User-Age'] = pd.to_numeric(valid_age['User-Age'], errors='coerce')
valid_age.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_age['User-Age'] = pd.to_numeric(valid_age['User-Age'], errors='coerce')


User-ID          int64
User-City       object
User-State      object
User-Country    object
User-Age         int64
dtype: object

In [163]:
mean_age_by_country = valid_age.groupby('User-Country')['User-Age'].mean()
overall_mean_age = valid_age['User-Age'].mean()

In [164]:
for index, row in users_sample.iterrows():
    if pd.isna(row['User-Age']):
        country = row['User-Country']
        if country in mean_age_by_country:
            users_sample.at[index, 'User-Age'] = mean_age_by_country[country]
        else:
            users_sample.at[index, 'User-Age'] = overall_mean_age

In [165]:
users_sample

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
8782,50984,cardiff,wales,united kingdom,18.0
7664,44037,london,england,united kingdom,26.0
37610,217705,nelson,british columbia,canada,25.0
28320,163548,andújar,jaén,spain,33.0
44040,253780,fredericton,new brunswick,canada,25.0
