In [255]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from fuzzywuzzy import process, fuzz

In [256]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/raw")


In [257]:
cwd = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir))

In [258]:
os.listdir(path)

['BX-Books.csv',
 'BX-NewBooks.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooksUsers.csv',
 'BX-Ratings.csv',
 'BX-Users.csv']

# Cleaning Bx-Users.csv

In [259]:
users = pd.read_csv(path + "/BX-Users.csv")
users.columns

Index(['User-ID', 'User-City', 'User-State', 'User-Country', 'User-Age'], dtype='object')

In [260]:
users.shape

(48299, 5)

In [261]:
users.dtypes

User-ID          int64
User-City       object
User-State      object
User-Country    object
User-Age        object
dtype: object

In [262]:
users.head()

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,"canada""",
1,9,germantown,tennessee,"usa""",
2,16,albuquerque,new mexico,"usa""",
3,17,chesapeake,virginia,"usa""",
4,19,weston,,,"14"""


#### Clean Country Names

#### Strip apostrophe and spaces

In [265]:
columns = ['User-Country', 'User-State', 'User-City','User-Age']
for column in columns:
    users[column] = users[column].apply(lambda x: x.strip().strip('"') if pd.notnull(x) and isinstance(x, str) else x)
users.head()

users.head()


Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,canada,
1,9,germantown,tennessee,usa,
2,16,albuquerque,new mexico,usa,
3,17,chesapeake,virginia,usa,
4,19,weston,,,"14"""


##### Clean values supposed to be NaN

In [None]:
pattern_1 = r'[xX]{2,6}'  # Matches 2 to 6 occurrences of "X"
pattern_2 = r'\b(n/a)\b' # Matches n/a 
pattern_3 = r'^\s$|^$' #matches whitespace entries
pattern_4 = r'-' # matches hyphen


# Replace matching values with np.nan
for column in ['User-Country', 'User-State', 'User-City']:
    # Replace matching values with np.nan using the respective pattern
    users[column] = users[column].replace(pattern_1, np.nan, regex=True)
    users[column] = users[column].replace(pattern_2, np.nan, regex =True)
    users[column] = users[column].replace(pattern_3, np.nan, regex=True)
    users[column] = users[column].replace(pattern_4, np.nan, regex=True)

  
# Fill remaining NaN values with np.nan
users.fillna(np.nan, inplace=True)

## Fix Abbreviated names

Dictionary of abbreviations

In [9]:
abbreviation_dict = {
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MS": "Mississippi",
    "MT": "Montana",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
    "DC": "District of Columbia",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
    "AS": "American Samoa",
    "GU": "Guam GU",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico PR",
    "VI": "U.S. Virgin Islands",
    'sg': 'singapore',
    'jax': "Jacksonville",
    'nyc' : "New York City",
    'ny' : 'New York',
    'la':'Las Vegas',
    'wi': "Wisconsin",
    'sj' : 'San Jose',
    'pdx':'portland',
    'atl' : 'atlanta',
    'rtr' : 'Remedios T. Romualdez',
    'phx' : 'phoenix',
    'hyd':'Hyderabad',
    'bcn': 'Barcelona',
    'ala':'alabama',
    'rr':'round rock', 
    'dc': "washington d.c",
    'apo': 'apopka',
    'kdh':'kill devil hills',
    'yvr': 'vancouver',
    'okc' : 'oklahoma city',
    'abq' :'albuquerque',
    'pve': 'palos verdes estates' ,
    'dfb' :'deerfield beach',
    'pj' : 'petaling Jaya',
    'van' : 'vancouver',
    'rsm' : 'Rancho Santa Margarita',
    'rvc' : " Rockville centre",
    'srq' : 'Sarasota',
    'br' : 'baton rouge',
    'kl' : "Kuala Lumpur",
    'kc' :'kansas city',
    'abc' : "alphabet city",
    'sf' : "San Francisco",
    'slc' : 'salt lake city',
    'wbl' : 'white bear lake', 
    'rtp' : 'research triangle park', 
    'li' : 'long island', 
    'hhi' : 'Hilton head island',
    
    }

In [4]:
def lowercase_dict(data):
  """Creates a new dictionary with lowercase keys."""
  return {key.lower(): value for key, value in data.items()}

In [8]:
lowercase_keys = lowercase_dict(abbreviation_dict)
print(lowercase_keys)

{'ak': 'Alaska', 'al': 'Alabama', 'ar': 'Arkansas', 'az': 'Arizona', 'ca': 'California', 'co': 'Colorado', 'ct': 'Connecticut', 'de': 'Delaware', 'fl': 'Florida', 'ga': 'Georgia', 'hi': 'Hawaii', 'ia': 'Iowa', 'id': 'Idaho', 'il': 'Illinois', 'in': 'Indiana', 'ks': 'Kansas', 'ky': 'Kentucky', 'la': 'Las Vegas', 'ma': 'Massachusetts', 'md': 'Maryland', 'me': 'Maine', 'mi': 'Michigan', 'mn': 'Minnesota', 'mo': 'Missouri', 'ms': 'Mississippi', 'mt': 'Montana', 'nc': 'North Carolina', 'nd': 'North Dakota', 'ne': 'Nebraska', 'nh': 'New Hampshire', 'nj': 'New Jersey', 'nm': 'New Mexico', 'nv': 'Nevada', 'ny': 'New York', 'oh': 'Ohio', 'ok': 'Oklahoma', 'or': 'Oregon', 'pa': 'Pennsylvania', 'ri': 'Rhode Island', 'sc': 'South Carolina', 'sd': 'South Dakota', 'tn': 'Tennessee', 'tx': 'Texas', 'ut': 'Utah', 'va': 'Virginia', 'vt': 'Vermont', 'wa': 'Washington', 'wi': 'Wisconsin', 'wv': 'West Virginia', 'wy': 'Wyoming', 'dc': 'washington d.c', 'as': 'American Samoa', 'gu': 'Guam GU', 'mp': 'North

In [None]:
cols = ['User-City','User-State']
for index, row in users.iterrows():
    for col in cols:            
        if row[col] in abbreviation_dict:
            users.at[index, col] = abbreviation_dict[row['col']]


Fill in empty states/countries via data base from : 

https://simplemaps.com/data/us-cities

https://simplemaps.com/data/world-cities

In [268]:
cities_path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/cities")
cities = pd.read_csv(cities_path + "/worldcities.csv")
us_cities = pd.read_csv(cities_path + "/uscities.csv")



Extract Relevant collumns

In [269]:
cities = cities[['city_ascii','country', 'admin_name']]
cities = cities[['city_ascii', 'country', 'admin_name']].rename(columns={'admin_name': 'state_name'})
filtered = cities[cities['country'] != 'United States']
print(cities.shape)
cities.head()

In [None]:

us_cities = us_cities[['city_ascii','state_name']]
us_cities['country'] = 'usa'
print(us_cities.shape)
us_cities.head()

In [None]:
cities = pd.concat([cities,us_cities], axis = 0)
cities = cities[['city_ascii', 'country', 'state_name']].rename(columns={'state_name': 'states'})

print(cities.shape)
cities.head()

##### Case folding

I also altered the format for the database i found online to match our situation better

In [270]:
cities['city_ascii'] = cities['city_ascii'].str.lower()
cities['country'] = cities['country'].str.lower()
cities['states'] = cities['states'].str.lower()


Fixing naming deviations

In [None]:
cities['country'] = cities['country'].replace('korea, south', 'south korea', regex=True)
cities['country'] = cities['country'].replace('united states', 'usa', regex=True)

## Imputing corrosponding states and country from cities

We are able to tell what state and country it is from the city, but we can't neccesarily tell the city from the country. <br>
So we will fuzzy match the cities where the row has a city name but not a state or country.<br> We will then find the corrosponding state and city in the cities database and fill those in.<br>



In [None]:
def fuzzy_match(input_string, choices):
    # Use process.extractOne to find the best match
    best_match, score = process.extractOne(input_string, choices)
    return best_match, score
            
        

fredericton


In [2]:
cities_list = cities['city_ascii'].to_list()
cities_list
        

NameError: name 'cities' is not defined

#### Impute missing values into cities

In [None]:
for index, row in users.iterrows():
    if (not pd.isnull(row['User-City'])) and (pd.isnull(row['User-State']) or pd.isnull(row['User-Country'])):
        city_to_find =fuzzy_match(row['User-City'],cities_list)[0]
        print(city_to_find)
        city_mask = cities[cities['city_ascii'] == city_to_find]
        if not city_mask.empty:
            state = city_mask['states'].iloc[0]
            country = city_mask['country'].iloc[0]
            users.loc[index, 'User-State'] = state
            users.loc[index, 'User-Country'] = country


In [None]:
users

## Predicting Age

In [None]:
valid_age = users.dropna(subset=['User-Age'])
valid_age['User-Age'] = pd.to_numeric(valid_age['User-Age'], errors='coerce')
valid_age.dtypes

In [None]:
mean_age_by_country = valid_age.groupby('User-Country')['User-Age'].mean()
overall_mean_age = valid_age['User-Age'].mean()

Imputation for Nan and setting outliers as mean

In [None]:
for index, row in users.iterrows():
    if pd.isna(row['User-Age']) or row['User-Age'] > 90 or row['User-Age'] < 0:
        country = row['User-Country']
        if country in mean_age_by_country:
            users.at[index, 'User-Age'] = mean_age_by_country[country]
        else:
            users.at[index, 'User-Age'] = overall_mean_age

In [None]:
users

# Cleaning Bx-NewBooksUsers

In [None]:
new_users = pd.read_csv(path + "/BX-NewBooksUsers.csv")
new_users.columns

#### Fixing should be Nan

In [None]:
pattern_1 = r'[xX]{2,6}'  # Matches 2 to 6 occurrences of "X"
pattern_2 = r'\b(n/a)\b' # Matches n/a 
pattern_3 = r'^\s$|^$' #matches whitespace entries

# Replace matching values with np.nan
for column in ['User-Country', 'User-State', 'User-City']:
    # Replace matching values with np.nan using the respective pattern
    new_users[column] = new_users[column].replace(pattern_1, np.nan, regex=True)
    new_users[column] = new_users[column].replace(pattern_2, np.nan)
    new_users[column] = new_users[column].replace(pattern_3, np.nan, regex=True)

# Fill remaining NaN values with np.nan
new_users.fillna(np.nan, inplace=True)


#### Strip apostophe and spaces

In [None]:
columns = ['User-Country', 'User-State', 'User-City','User-Age']
for column in columns:
    new_users[column] = new_users[column].apply(lambda x: x.strip().strip('"') if pd.notnull(x) and isinstance(x, str) else x)
new_users.head()

### Fix Abreviations


In [None]:
cols = ['User-City','User-State']
for index, row in users.iterrows():
    for col in cols:            
        if row[col] in abbreviation_dict:
            users.at[index, col] = abbreviation_dict[row['col']]

#### Fuzzy Match and Imputation

In [None]:
for index, row in new_users.iterrows():
    if (not pd.isnull(row['User-City'])) and (pd.isnull(row['User-State']) or pd.isnull(row['User-Country'])):
        city_to_find =fuzzy_match(row['User-City'],cities_list)[0]
        print(city_to_find)
        city_mask = cities[cities['city_ascii'] == city_to_find]
        if not city_mask.empty:
            state = city_mask['states'].iloc[0]
            country = city_mask['country'].iloc[0]
            new_users.loc[index, 'User-State'] = state
            new_users.loc[index, 'User-Country'] = country
            

### Predicting age

We impute Nan age with the country mean

In [None]:
valid_age = new_users.dropna(subset=['User-Age'])
valid_age['User-Age'] = pd.to_numeric(valid_age['User-Age'], errors='coerce')
valid_age.dtypes

In [None]:
mean_age_by_country = valid_age.groupby('User-Country')['User-Age'].mean()
overall_mean_age = valid_age['User-Age'].mean()

Imputation and also fixing outliers

In [None]:
for index, row in new_users.iterrows():
    if pd.isna(row['User-Age']) or row['User-Age'] > 90 or row['User-Age'] < 0:
        country = row['User-Country']
        if country in mean_age_by_country:
            new_users.at[index, 'User-Age'] = mean_age_by_country[country]
        else:
            new_users.at[index, 'User-Age'] = overall_mean_age

In [None]:
new_users

## Save final df to csv

In [None]:
users.to_csv('Bx-Cleaned-Users')
new_users.to_csv('Bx-Cleaned-NewBooksUsers')