In [255]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from fuzzywuzzy import process, fuzz

In [256]:
# Directory holding the raw BX-*.csv files: <parent of the working directory>/data/raw.
# The path is built from components so separators stay consistent on every OS
# (the previous single-argument os.path.join was a no-op).
path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir)), "data", "raw"
)


In [257]:
# NOTE: despite its name, `cwd` holds the normalised *parent* of the current
# working directory (cwd + separator + "..").
cwd = os.path.normpath(os.sep.join([os.getcwd(), os.pardir]))

In [258]:
# os.listdir returns entries in arbitrary order; sort so the displayed listing
# is the same on every run and machine.
sorted(os.listdir(path))

['BX-Books.csv',
 'BX-NewBooks.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooksUsers.csv',
 'BX-Ratings.csv',
 'BX-Users.csv']

# Cleaning BX-Users.csv

In [259]:
# Read the raw users table and show which columns it provides.
users_csv = path + "/BX-Users.csv"
users = pd.read_csv(users_csv)
users.columns

Index(['User-ID', 'User-City', 'User-State', 'User-Country', 'User-Age'], dtype='object')

In [260]:
# (rows, columns) of the users table as loaded from the CSV.
users.shape

(48299, 5)

In [261]:
# Column dtypes as inferred by read_csv, before any cleaning.
users.dtypes

User-ID          int64
User-City       object
User-State      object
User-Country    object
User-Age        object
dtype: object

In [262]:
# Preview the first rows of the raw users table.
users.head()

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,"canada""",
1,9,germantown,tennessee,"usa""",
2,16,albuquerque,new mexico,"usa""",
3,17,chesapeake,virginia,"usa""",
4,19,weston,,,"14"""


#### Clean Country Names

##### Clean values supposed to be NaN

In [264]:
# Placeholder values that really mean "missing". Both patterns are anchored so
# that only cells consisting *entirely* of the placeholder are blanked; an
# unanchored pattern would also wipe legitimate names that merely contain "xx".
# Raw cells can carry a stray double quote (e.g. `canada"`), so optional quotes
# and surrounding whitespace are tolerated.
pattern_1 = r'^\s*"?\s*[xX]{2,}\s*"?\s*$'    # a run of two or more x/X
pattern_2 = r'^\s*"?\s*[nN]/[aA]\s*"?\s*$'   # "n/a" in any letter case

for column in ['User-Country', 'User-State', 'User-City']:
    # regex=True is required for BOTH patterns; without it pandas looks for the
    # literal pattern text and never matches anything.
    users[column] = users[column].replace([pattern_1, pattern_2], np.nan, regex=True)




#### Strip quotation marks and spaces

In [265]:
columns = ['User-Country', 'User-State', 'User-City', 'User-Age']
for column in columns:
    # Remove surrounding whitespace and stray double quotes from string cells.
    # The final strip() also removes whitespace that was hidden inside the
    # quote (e.g. `canada "`). Non-string cells such as NaN pass through as-is.
    users[column] = users[column].map(
        lambda value: value.strip().strip('"').strip() if isinstance(value, str) else value
    )

users.head()


Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,canada,
1,9,germantown,tennessee,usa,
2,16,albuquerque,new mexico,usa,
3,17,chesapeake,virginia,usa,
4,19,weston,,,"14"""


Fill in empty states/countries using the city databases from:

https://simplemaps.com/data/us-cities

https://simplemaps.com/data/world-cities

In [268]:
# Reference city tables (downloaded from simplemaps) are expected under
# <parent of the working directory>/data/cities. Paths are assembled from
# components instead of hand-written "/" separators.
cities_path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir)), "data", "cities"
)
cities = pd.read_csv(os.path.join(cities_path, "worldcities.csv"))
us_cities = pd.read_csv(os.path.join(cities_path, "uscities.csv"))



Extract relevant columns

In [269]:
# Keep only the columns needed for imputation and rename the administrative
# region to `state_name` so it lines up with the US table. Renaming before
# selecting lets the cell be re-run without failing on the missing
# `admin_name` column.
cities = (
    cities
    .rename(columns={'admin_name': 'state_name'})
    [['city_ascii', 'country', 'state_name']]
)

# Drop US rows from the world table: US cities are supplied by the dedicated
# `us_cities` table, and keeping both would duplicate them after the concat.
# (Previously this filter was stored in an unused variable and had no effect.)
cities = cities[cities['country'] != 'United States']

print(cities.shape)
cities.head()

In [None]:

# Keep city and state, and tag every row with the country label used in the
# users data. `.assign` returns a new frame, which avoids the
# SettingWithCopyWarning raised when a column is added to a slice.
us_cities = us_cities[['city_ascii', 'state_name']].assign(country='usa')
print(us_cities.shape)
us_cities.head()

In [None]:
# Stack the world and US tables into one lookup table. ignore_index=True gives
# the result a fresh 0..n-1 index instead of repeating the labels of both inputs.
cities = (
    pd.concat([cities, us_cities], axis=0, ignore_index=True)
    [['city_ascii', 'country', 'state_name']]
    .rename(columns={'state_name': 'states'})
)

print(cities.shape)
cities.head()

##### Case folding

I also altered the format of the database I found online to better match our data.

In [270]:
# Lower-case every text column of the lookup table so comparisons against the
# (lower-case) user data are case-insensitive.
for text_column in ('city_ascii', 'country', 'states'):
    cities[text_column] = cities[text_column].str.lower()


Fixing naming deviations

In [None]:
# Rewrite country spellings in the lookup table to the ones used in the users
# data. Both replacements are regex substitutions applied in sequence.
cities['country'] = (
    cities['country']
    .replace('korea, south', 'south korea', regex=True)
    .replace('united states', 'usa', regex=True)
)

## Imputing corresponding states and countries from cities

We are able to tell what state and country it is from the city, but we can't necessarily tell the city from the country. <br>
So we will fuzzy match the cities where the row has a city name but not a state or country.<br> We will then find the corresponding state and country in the cities database and fill those in.<br>



In [None]:
def fuzzy_match(input_string, choices):
    """Return the closest entry in ``choices`` to ``input_string``.

    Parameters
    ----------
    input_string : str
        Text to look up. Anything that is not a non-blank string yields no match.
    choices : iterable of str
        Candidate strings to compare against.

    Returns
    -------
    tuple
        ``(best_match, score)`` as reported by ``process.extractOne``, or
        ``(None, 0)`` when there is nothing to match (invalid input, or the
        matcher returned no result, e.g. because ``choices`` is empty).
    """
    # Non-string or blank input cannot be matched; report "no match" rather
    # than letting the matcher raise on it.
    if not isinstance(input_string, str) or not input_string.strip():
        return None, 0

    result = process.extractOne(input_string, choices)
    if result is None:
        return None, 0

    # Index instead of unpacking: for dict-like choices the matcher returns a
    # third element (the key), which would break a two-name unpack.
    return result[0], result[1]
            
        

fredericton


In [2]:
# Candidate city names for fuzzy matching. Missing names are dropped and
# duplicates removed (first-seen order is kept) so the matcher does not score
# the same name repeatedly.
cities_list = cities['city_ascii'].dropna().unique().tolist()

# Show the size rather than dumping the whole list into the output.
len(cities_list)
        

NameError: name 'cities' is not defined

#### Impute missing states and countries into users

In [None]:
# Lowest fuzzy-match score (fuzzywuzzy reports 0-100) that is trusted enough to
# impute from. Without a cut-off, even a very poor "best" match would be used.
# NOTE(review): 90 is a conservative starting point — tune against a sample.
MIN_MATCH_SCORE = 90

# One reference row per city name; keeping the first occurrence reproduces the
# previous "first matching row" lookup.
city_reference = cities.drop_duplicates(subset='city_ascii').set_index('city_ascii')

# Rows that have a city but lack a state and/or a country.
needs_location = users['User-City'].notna() & (
    users['User-State'].isna() | users['User-Country'].isna()
)

# Fuzzy matching is expensive, so each distinct city string is matched once.
match_cache = {}
for index, city in users.loc[needs_location, 'User-City'].items():
    if city not in match_cache:
        match_cache[city] = fuzzy_match(city, cities_list)
    matched_city, score = match_cache[city]

    if (
        matched_city is None
        or score < MIN_MATCH_SCORE
        or matched_city not in city_reference.index
    ):
        continue

    # Fill only what is actually missing; values the user supplied are kept.
    if pd.isnull(users.at[index, 'User-State']):
        users.at[index, 'User-State'] = city_reference.at[matched_city, 'states']
    if pd.isnull(users.at[index, 'User-Country']):
        users.at[index, 'User-Country'] = city_reference.at[matched_city, 'country']


In [None]:
# Preview the result instead of rendering the whole frame.
users.head(10)

## Predicting Age

In [None]:
# Users with a usable numeric age. The age is converted first and rows are
# dropped afterwards, so entries that cannot be parsed as numbers are excluded
# too. `.assign` builds a new frame, avoiding SettingWithCopyWarning.
valid_age = (
    users
    .assign(**{'User-Age': pd.to_numeric(users['User-Age'], errors='coerce')})
    .dropna(subset=['User-Age'])
)
valid_age.dtypes

In [None]:
# Average age overall and per country, computed from users with a known age.
age_column = valid_age['User-Age']
overall_mean_age = age_column.mean()
mean_age_by_country = age_column.groupby(valid_age['User-Country']).mean()

In [None]:
# Impute missing ages: use the mean age of the user's country when one is
# available, otherwise the overall mean. Converting to numeric first gives the
# column a single numeric dtype (instead of a mix of strings and floats) and
# means unparseable ages are imputed as well.
numeric_age = pd.to_numeric(users['User-Age'], errors='coerce')
country_mean_age = users['User-Country'].map(mean_age_by_country)
users['User-Age'] = numeric_age.fillna(country_mean_age).fillna(overall_mean_age)

In [None]:
# Preview the result instead of rendering the whole frame.
users.head(10)

# Cleaning BX-NewBooksUsers.csv

In [None]:
# Read the raw new-books users table and show which columns it provides.
new_users_csv = path + "/BX-NewBooksUsers.csv"
new_users = pd.read_csv(new_users_csv)
new_users.columns

#### Clean values supposed to be NaN

In [None]:
# Placeholder values that really mean "missing". Both patterns are anchored so
# that only cells consisting *entirely* of the placeholder are blanked; an
# unanchored pattern would also wipe legitimate names that merely contain "xx".
# Optional double quotes and surrounding whitespace are tolerated because the
# quote-stripping step only runs after this one.
pattern_1 = r'^\s*"?\s*[xX]{2,}\s*"?\s*$'    # a run of two or more x/X
pattern_2 = r'^\s*"?\s*[nN]/[aA]\s*"?\s*$'   # "n/a" in any letter case

for column in ['User-Country', 'User-State', 'User-City']:
    # regex=True is required for BOTH patterns; without it pandas looks for the
    # literal pattern text and never matches anything.
    new_users[column] = new_users[column].replace([pattern_1, pattern_2], np.nan, regex=True)


#### Strip quotation marks and spaces

In [None]:
columns = ['User-Country', 'User-State', 'User-City', 'User-Age']
for column in columns:
    # Remove surrounding whitespace and stray double quotes from string cells.
    # The final strip() also removes whitespace that was hidden inside the
    # quote. Non-string cells such as NaN pass through as-is.
    new_users[column] = new_users[column].map(
        lambda value: value.strip().strip('"').strip() if isinstance(value, str) else value
    )
new_users.head()

#### Fuzzy Match and Imputation

In [None]:
# Lowest fuzzy-match score (fuzzywuzzy reports 0-100) that is trusted enough to
# impute from. Without a cut-off, even a very poor "best" match would be used.
# NOTE(review): 90 is a conservative starting point — tune against a sample.
MIN_MATCH_SCORE = 90

# One reference row per city name; keeping the first occurrence reproduces the
# previous "first matching row" lookup.
city_reference = cities.drop_duplicates(subset='city_ascii').set_index('city_ascii')

# Rows that have a city but lack a state and/or a country.
needs_location = new_users['User-City'].notna() & (
    new_users['User-State'].isna() | new_users['User-Country'].isna()
)

# Fuzzy matching is expensive, so each distinct city string is matched once.
match_cache = {}
for index, city in new_users.loc[needs_location, 'User-City'].items():
    if city not in match_cache:
        match_cache[city] = fuzzy_match(city, cities_list)
    matched_city, score = match_cache[city]

    if (
        matched_city is None
        or score < MIN_MATCH_SCORE
        or matched_city not in city_reference.index
    ):
        continue

    # Fill only what is actually missing; values the user supplied are kept.
    if pd.isnull(new_users.at[index, 'User-State']):
        new_users.at[index, 'User-State'] = city_reference.at[matched_city, 'states']
    if pd.isnull(new_users.at[index, 'User-Country']):
        new_users.at[index, 'User-Country'] = city_reference.at[matched_city, 'country']
            

### Predicting age

In [None]:
# New-books users with a usable numeric age. The age is converted first and
# rows are dropped afterwards, so entries that cannot be parsed as numbers are
# excluded too. `.assign` builds a new frame, avoiding SettingWithCopyWarning.
valid_age = (
    new_users
    .assign(**{'User-Age': pd.to_numeric(new_users['User-Age'], errors='coerce')})
    .dropna(subset=['User-Age'])
)
valid_age.dtypes

In [None]:
# Average age overall and per country, computed from users with a known age.
age_column = valid_age['User-Age']
overall_mean_age = age_column.mean()
mean_age_by_country = age_column.groupby(valid_age['User-Country']).mean()

In [None]:
# Impute missing ages: use the mean age of the user's country when one is
# available, otherwise the overall mean. Converting to numeric first gives the
# column a single numeric dtype (instead of a mix of strings and floats) and
# means unparseable ages are imputed as well.
numeric_age = pd.to_numeric(new_users['User-Age'], errors='coerce')
country_mean_age = new_users['User-Country'].map(mean_age_by_country)
new_users['User-Age'] = numeric_age.fillna(country_mean_age).fillna(overall_mean_age)

In [None]:
# Preview the result instead of rendering the whole frame.
new_users.head(10)