In [255]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import re

from fuzzywuzzy import process, fuzz

In [256]:
# Raw input files live in <parent of the working directory>/data/raw.
parent_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
path = parent_dir + "/data/raw"


In [257]:
# Despite its name, `cwd` holds the normalised PARENT of the working directory.
cwd = os.path.normpath(os.path.join(os.getcwd() + os.sep + os.pardir))

In [258]:
# Show the entries of the raw-data directory held in `path`.
os.listdir(path)

['BX-Books.csv',
 'BX-NewBooks.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooksUsers.csv',
 'BX-Ratings.csv',
 'BX-Users.csv']

# Cleaning BX-Users.csv

In [259]:
# Load the users table from the raw-data directory, then display its column labels.
users = pd.read_csv(path + "/BX-Users.csv")
users.columns

Index(['User-ID', 'User-City', 'User-State', 'User-Country', 'User-Age'], dtype='object')

In [260]:
# (rows, columns) of the users table.
users.shape

(48299, 5)

In [261]:
# Per-column dtypes of the users table.
users.dtypes

User-ID          int64
User-City       object
User-State      object
User-Country    object
User-Age        object
dtype: object

In [262]:
# Preview the first rows of the raw users table.
users.head()

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,"canada""",
1,9,germantown,tennessee,"usa""",
2,16,albuquerque,new mexico,"usa""",
3,17,chesapeake,virginia,"usa""",
4,19,weston,,,"14"""


#### Clean Country Names

In [263]:
# Flag every row whose country cell is not a Python string (e.g. a missing value).
country_is_text = users['User-Country'].map(lambda cell: isinstance(cell, str))
non_string_countries = users[~country_is_text]
non_string_countries

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
4,19,weston,,,"14"""
133,777,portland,,,"13"""
160,929,king of prussia,,,"36"""
198,1131,berkeley,,,"22"""
284,1589,clio,michigan,,"50"""
...,...,...,...,...,...
47674,275358,pueblo,,,"15"""
47827,276240,smyrna,,,"28"""
47865,276496,the colony,,,"45"""
47869,276538,cologne,,,"26"""


##### Clean values supposed to be NaN

In [264]:
# Placeholder spellings that really mean "unknown".
pattern_1 = r'[xX]{2,6}'  # a run of 2 to 6 "x"/"X" characters anywhere in the cell
pattern_2 = r'\b(n/a)\b'  # the token "n/a"

# Both patterns must be applied with regex=True: without it pandas compares the
# pattern text literally, so pattern_2 never matched anything.
# With regex=True and a non-string replacement, the WHOLE cell becomes NaN as
# soon as the pattern is found in it, so values that still carry a stray
# trailing '"' (quotes are stripped in a later cell) are caught as well.
# NOTE(review): pattern_1 is unanchored, so a genuine name containing "xx"
# would also be blanked - confirm that is acceptable for this data.
for column in ['User-Country', 'User-State', 'User-City']:
    users[column] = users[column].replace(pattern_1, np.nan, regex=True)
    users[column] = users[column].replace(pattern_2, np.nan, regex=True)

# The former `users.fillna(np.nan, inplace=True)` was dropped: replacing NaN
# with NaN changes nothing.




#### Strip double quotes and surrounding spaces

In [265]:

# For each location column: trim surrounding whitespace, then surrounding
# double quotes. Non-string cells (missing values) are passed through untouched.
for column in ('User-Country', 'User-State', 'User-City'):
    users[column] = users[column].map(
        lambda cell: cell.strip().strip('"') if isinstance(cell, str) else cell
    )

users.head()


Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,canada,
1,9,germantown,tennessee,usa,
2,16,albuquerque,new mexico,usa,
3,17,chesapeake,virginia,usa,
4,19,weston,,,"14"""


#### Fill in empty countries via database

In [268]:
# The city reference tables live in <parent of the working directory>/data/cities.
cities_path = os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/cities"

world_cities_csv = cities_path + "/worldcities.csv"
cities = pd.read_csv(world_cities_csv)


In [269]:
# Append the US city table to the world city table.
# Note: `cities` is both input and output here, so re-running this cell on its
# own appends the US rows a second time.
us_cities = pd.read_csv(cities_path + "/uscities.csv")
cities = pd.concat([cities,us_cities])

##### Case folding

I also altered the format of the city database I found online so that it matches our data better.

In [270]:
# Lower-case the three reference columns, rename two countries to the spelling
# used in the users data, and keep only the columns needed for matching.
country_lower = (
    cities['country']
    .str.lower()
    .replace('korea, south', 'south korea', regex=True)
    .replace('united states', 'usa', regex=True)
)

cities = cities.assign(
    city_ascii=cities['city_ascii'].str.lower(),
    country=country_lower,
    states=cities['states'].str.lower(),
)[['city_ascii', 'country', 'states']]
cities




#### Fuzzy-match misspelt cities/states/countries

In [276]:
def fuzzy_match(value, choices, threshold=0):
    """Return the entry of ``choices`` that fuzzywuzzy ranks closest to ``value``.

    ``value`` is converted with ``str()`` before matching. ``threshold`` is
    forwarded to ``process.extractOne`` as ``score_cutoff``. When that call
    yields no match, ``None`` is returned.
    """
    best = process.extractOne(str(value), choices, score_cutoff=threshold)
    if not best:
        return None
    return best[0]


In [277]:
# Snap every location value to its closest entry in the reference table.
# Scoring is by far the expensive part, so each DISTINCT user value is matched
# once against the de-duplicated reference values and the result is mapped back
# onto the column, instead of re-scoring the full reference column per row.
# Missing reference entries are dropped because they are not text to compare
# against; missing user values stay missing.
for column, reference_column in [
    ('User-City', 'city_ascii'),
    ('User-State', 'states'),
    ('User-Country', 'country'),
]:
    candidates = cities[reference_column].dropna().drop_duplicates()
    closest = {
        value: fuzzy_match(value, candidates)
        for value in users[column].dropna().unique()
    }
    users[column] = users[column].map(closest)


#### Impute missing states/countries from cities

In [None]:
# `cities` was reduced to ['city_ascii', 'country', 'states'] earlier, so the
# lookups are keyed on 'city_ascii' (there is no 'city' column any more).
# Keeping the first reference row per key mirrors the former `.iloc[0]` choice,
# and a key absent from the reference table now simply stays missing instead
# of raising.
city_lookup = (
    cities.dropna(subset=['city_ascii'])
    .drop_duplicates(subset='city_ascii')
    .set_index('city_ascii')
)
state_lookup = (
    cities.dropna(subset=['states'])
    .drop_duplicates(subset='states')
    .set_index('states')
)

# Build every mask BEFORE filling anything, so the country fallback looks at
# the states as they were originally recorded.
city_known = users['User-City'].notna()
state_from_city = users['User-State'].isna() & city_known
country_from_city = users['User-Country'].isna() & city_known
country_from_state = (
    users['User-Country'].isna() & ~city_known & users['User-State'].notna()
)

# State: taken from the city when the state is missing.
users.loc[state_from_city, 'User-State'] = (
    users.loc[state_from_city, 'User-City'].map(city_lookup['states'])
)
# Country: taken from the city when available, otherwise from the state.
users.loc[country_from_city, 'User-Country'] = (
    users.loc[country_from_city, 'User-City'].map(city_lookup['country'])
)
users.loc[country_from_state, 'User-Country'] = (
    users.loc[country_from_state, 'User-State'].map(state_lookup['country'])
)



## Predicting Age

We proceed with KNN imputation. First we set up the pipeline and encode the categorical data.

In [None]:
# Define categorical and numerical columns
categorical_cols = ['User-City', 'User-State', 'User-Country']
numerical_cols = ['User-Age']

# Ages are read as text (e.g. '14"'). Strip the stray quote and convert to
# numbers; anything that is not a number becomes NaN and will be imputed.
users['User-Age'] = pd.to_numeric(
    users['User-Age'].astype(str).str.strip().str.strip('"'),
    errors='coerce',
)

# Rows with a known age are the donors the imputer learns from.
data = users.dropna(subset=['User-Age'])

# The age column has to stay in the feature frame: it is the column the imputer
# fills, and the transformer below selects it by name.
X = data[categorical_cols + numerical_cols]
y = data['User-Age']

# Define preprocessor
# - handle_unknown='ignore': rows imputed later may hold categories that never
#   occur among the donors.
# - sparse_threshold=0: always emit a dense array, because KNNImputer does not
#   take sparse input.
# NOTE(review): a dense one-hot matrix can be large when there are many
# distinct cities - confirm it fits in memory for this data.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ],
    sparse_threshold=0,
)

# Define KNN imputer ('nan_euclidean' is the built-in metric that tolerates the
# missing values the imputer has to work with)
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Create pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('imputer', knn_imputer)])

# Fit pipeline to training data
pipeline.fit(X, y)


Next, we use the KNN imputer to fill in the missing ages.

In [None]:
# Boolean mask of the rows whose age is missing.
missing_age_indices = users['User-Age'].isnull()

# Run those rows through the fitted pipeline; the age column is the last
# transformer entry, so its filled values sit in the final output column.
feature_columns = categorical_cols + numerical_cols
imputed_age = pipeline.transform(users.loc[missing_age_indices, feature_columns])
users.loc[missing_age_indices, 'User-Age'] = imputed_age[:, -1]

# Cleaning BX-NewBooksUsers.csv

In [None]:
# Load the new-books users table from the raw-data directory, then display its column labels.
new_users = pd.read_csv(path + "/BX-NewBooksUsers.csv")
new_users.columns

#### Clean values supposed to be NaN

In [None]:
# Placeholder spellings that really mean "unknown".
pattern_1 = r'[xX]{2,6}'  # a run of 2 to 6 "x"/"X" characters anywhere in the cell
pattern_2 = r'\b(n/a)\b'  # the token "n/a"

# Both patterns must be applied with regex=True: without it pandas compares the
# pattern text literally, so pattern_2 never matched anything.
# With regex=True and a non-string replacement, the WHOLE cell becomes NaN as
# soon as the pattern is found in it, so values that still carry a stray
# trailing '"' (quotes are stripped in a later cell) are caught as well.
# NOTE(review): pattern_1 is unanchored, so a genuine name containing "xx"
# would also be blanked - confirm that is acceptable for this data.
for column in ['User-Country', 'User-State', 'User-City']:
    new_users[column] = new_users[column].replace(pattern_1, np.nan, regex=True)
    new_users[column] = new_users[column].replace(pattern_2, np.nan, regex=True)

# The former `new_users.fillna(np.nan, inplace=True)` was dropped: replacing
# NaN with NaN changes nothing.


#### Strip double quotes and surrounding spaces

In [None]:
# For each location column: trim surrounding whitespace, then surrounding
# double quotes. Non-string cells (missing values) are passed through untouched.
for column in ['User-Country', 'User-State', 'User-City']:
    new_users[column] = new_users[column].apply(
        lambda x: x.strip().strip('"') if isinstance(x, str) else x
    )

# Preview the frame that was just cleaned (this cell previously displayed
# `users`, so the result of the cleaning above was never shown).
new_users.head()

#### Fuzzy Match

In [None]:
# Snap every location value to its closest entry in the reference table.
# (The country line previously read `new_ users[...]`, which is a syntax error.)
# Scoring is by far the expensive part, so each DISTINCT value is matched once
# against the de-duplicated reference values and the result is mapped back onto
# the column, instead of re-scoring the full reference column per row.
# Missing reference entries are dropped because they are not text to compare
# against; missing user values stay missing.
for column, reference_column in [
    ('User-City', 'city_ascii'),
    ('User-State', 'states'),
    ('User-Country', 'country'),
]:
    candidates = cities[reference_column].dropna().drop_duplicates()
    closest = {
        value: fuzzy_match(value, candidates)
        for value in new_users[column].dropna().unique()
    }
    new_users[column] = new_users[column].map(closest)


#### Impute missing values into states/countries

In [None]:
# `cities` was reduced to ['city_ascii', 'country', 'states'] earlier, so the
# lookups are keyed on 'city_ascii' (there is no 'city' column any more).
# Keeping the first reference row per key mirrors the former `.iloc[0]` choice,
# and a key absent from the reference table now simply stays missing instead
# of raising.
city_lookup = (
    cities.dropna(subset=['city_ascii'])
    .drop_duplicates(subset='city_ascii')
    .set_index('city_ascii')
)
state_lookup = (
    cities.dropna(subset=['states'])
    .drop_duplicates(subset='states')
    .set_index('states')
)

# Build every mask BEFORE filling anything, so the country fallback looks at
# the states as they were originally recorded.
city_known = new_users['User-City'].notna()
state_from_city = new_users['User-State'].isna() & city_known
country_from_city = new_users['User-Country'].isna() & city_known
country_from_state = (
    new_users['User-Country'].isna() & ~city_known & new_users['User-State'].notna()
)

# State: taken from the city when the state is missing.
new_users.loc[state_from_city, 'User-State'] = (
    new_users.loc[state_from_city, 'User-City'].map(city_lookup['states'])
)
# Country: taken from the city when available, otherwise from the state.
new_users.loc[country_from_city, 'User-Country'] = (
    new_users.loc[country_from_city, 'User-City'].map(city_lookup['country'])
)
new_users.loc[country_from_state, 'User-Country'] = (
    new_users.loc[country_from_state, 'User-State'].map(state_lookup['country'])
)

### Predicting age

Encode categorical data into numerical data

In [None]:
# Define categorical and numerical columns
categorical_cols = ['User-City', 'User-State', 'User-Country']
numerical_cols = ['User-Age']

# Strip any stray quote from the ages and convert them to numbers; anything
# that is not a number becomes NaN and will be imputed.
new_users['User-Age'] = pd.to_numeric(
    new_users['User-Age'].astype(str).str.strip().str.strip('"'),
    errors='coerce',
)

# Rows with a known age are the donors the imputer learns from.
data = new_users.dropna(subset=['User-Age'])

# The age column has to stay in the feature frame: it is the column the imputer
# fills, and the transformer below selects it by name.
X = data[categorical_cols + numerical_cols]
y = data['User-Age']

# Define preprocessor
# - handle_unknown='ignore': rows imputed later may hold categories that never
#   occur among the donors.
# - sparse_threshold=0: always emit a dense array, because KNNImputer does not
#   take sparse input.
# NOTE(review): a dense one-hot matrix can be large when there are many
# distinct cities - confirm it fits in memory for this data.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ],
    sparse_threshold=0,
)

# Define KNN imputer ('nan_euclidean' is the built-in metric that tolerates the
# missing values the imputer has to work with)
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Create pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('imputer', knn_imputer)])

# Fit pipeline to training data
pipeline.fit(X, y)


Impute the missing ages with the KNN imputer.

In [None]:
# Boolean mask of the `new_users` rows whose age is missing. The mask and the
# features must come from `new_users` itself: this cell previously read them
# from `users`, a different table, and wrote the result into `new_users`.
missing_age_indices = new_users['User-Age'].isna()

# Run those rows through the fitted pipeline; the age column is the last
# transformer entry, so its filled values sit in the final output column.
imputed_age = pipeline.transform(new_users.loc[missing_age_indices, categorical_cols + numerical_cols])
new_users.loc[missing_age_indices, 'User-Age'] = imputed_age[:, -1]