# IMPORT

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from joblib import dump

import neptune.new as neptune
import optuna
import neptune.new.integrations.optuna as optuna_utils

# DEFINE

In [2]:
DATA_PATH = '../data/'

In [3]:
data = pd.read_csv(DATA_PATH+'prep_data.csv')

In [4]:
data

Unnamed: 0,City,Ranking,Rating,Number_of_Reviews,sample,County,Population,Capital,num_of_cuisine_styles,Pakistani,...,Azerbaijani,Lebanese,Argentinean,Nepali,Pizza,Mexican,New Zealand,Taiwanese,Contemporary,Price
0,Paris,5570.0,3.5,194.0,1,France,2190327,1,3,0,...,0,0,0,0,0,0,0,0,0,2.0
1,Stockholm,1537.0,4.0,10.0,1,Sweden,961609,1,1,0,...,0,0,0,0,0,0,0,0,0,2.0
2,London,353.0,4.5,688.0,1,UK,8908081,1,7,0,...,0,0,0,0,0,0,0,0,0,3.0
3,Berlin,3458.0,5.0,3.0,1,Germany,3644826,1,1,0,...,0,0,0,0,0,0,0,0,0,2.0
4,Munich,621.0,4.0,84.0,1,Germany,1456039,0,3,0,...,0,0,0,0,0,0,0,0,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,London,4367.0,,28.0,0,UK,8908081,1,4,0,...,0,0,0,0,0,0,0,0,0,1.0
49996,Lisbon,2612.0,,2.0,0,Portugalia,505526,1,4,0,...,0,0,0,1,0,0,0,0,0,2.0
49997,Milan,5562.0,,2.0,0,Italy,1378689,0,1,0,...,0,0,0,0,0,0,0,0,0,2.0
49998,Milan,4010.0,,2.0,0,Italy,1378689,0,1,0,...,0,0,0,0,0,0,0,0,0,2.0


In [8]:
train = data[data['sample']==1].drop(columns=['sample'])
test = data[data['sample']==0].drop(columns=['sample'])

In [21]:
def rating(prediction):
        if prediction < 0.25:
            return 0
        elif 0.25 < prediction <= 0.75:
            return 0.5
        elif 0.75 < prediction <= 1.25:
            return 1
        elif 1.25 <prediction <= 1.75:
            return 1.5
        elif 1.75 < prediction <= 2.25:
            return 2
        elif 2.25 < prediction <= 2.75:
            return 2.5
        elif 2.75 < prediction <= 3.25:
            return 3
        elif 3.25 < prediction <= 3.75:
            return 3.5
        elif 3.75 < prediction <= 4.25:
            return 4
        elif 4.25 < prediction <= 4.75:
            return 4.5
        else:
            return 5

In [11]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    return rmse, mae, r2, mape

In [28]:
def nt_sklearn_lr(data, norm=False, log=False):
    
    df = data.copy()
    df.reset_index(inplace=True)
    
    run = neptune.init(project='alxkzncoff/trip-rating',
                       api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5ZmI4NDE1Ny03YzcyLTQwOTQtOTUwMi1kODlkYWMwY2YwOGQifQ==',
                       name = 'sklearn_regression')

    # Dummies
    df_dummies = pd.get_dummies(df, 
                                columns=['City', 'County'])

    X = df_dummies.drop(['Rating'], axis=1)
    y = df_dummies['Rating']
    
    # Normalization
    print(f'Normalization: {norm}')
    if norm == True:
        MMS = MinMaxScaler()
        X[['Ranking', 'Number_of_Reviews', 'Population', 'num_of_cuisine_styles']] = MMS.fit_transform(X[['Ranking', 'Number_of_Reviews', 'Population', 'num_of_cuisine_styles']])
        
    print(f'Logarithm: {log}')
    if log == True:
        X[['Ranking', 'Number_of_Reviews', 'Population', 'num_of_cuisine_styles']] = np.log1p(X[['Ranking', 'Number_of_Reviews', 'Population', 'num_of_cuisine_styles']])
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    predict = lr.predict(X_test)
    
    for i in range(predict.size):
        predict[i]=rating(predict[i])
    
    (rmse, mae, r2, mape) = eval_metrics(y_test, predict)

    run["sys/tags"].add(['sklearn_LinearReg'])
    run['sk_LinearRegression/parameters'] = {'normalization': norm,
                                             'logarithm': log}
    run['sk_LinearRegression/rmse'] = np.round(rmse, 2)
    run['sk_LinearRegression/mae'] = np.round(mae, 2)
    run['sk_LinearRegression/r2'] = np.round(r2, 2)
    run['sk_LinearRegression/mape'] = np.round(mape, 2)
    run["sk_LinearRegression/model"].upload('sklearn_linear_regression_model.pkl')

    print(f'RMSE: {np.round(rmse, 2)} | MAE: {np.round(mae, 2)} | R2: {np.round(r2, 2)} | MAPE: {np.round(mape, 2)}')

    dump(lr, 'sklearn_linear_regression_model.pkl')

    return predict

In [29]:
for norm in [False, True]:
    nt_sklearn_lr(train, norm)

https://app.neptune.ai/alxkzncoff/trip-rating/e/TRIP-11
Normalization: False
Logarithm: False
RMSE: 0.59 | MAE: 0.41 | R2: 0.22 | MAPE: 0.12
https://app.neptune.ai/alxkzncoff/trip-rating/e/TRIP-12
Normalization: True
Logarithm: False
RMSE: 0.59 | MAE: 0.41 | R2: 0.22 | MAPE: 0.12


In [30]:
for log in [False, True]:
    nt_sklearn_lr(train, norm)

https://app.neptune.ai/alxkzncoff/trip-rating/e/TRIP-13
Normalization: True
Logarithm: False
RMSE: 0.59 | MAE: 0.41 | R2: 0.22 | MAPE: 0.12
https://app.neptune.ai/alxkzncoff/trip-rating/e/TRIP-14
Normalization: True
Logarithm: False
RMSE: 0.59 | MAE: 0.41 | R2: 0.22 | MAPE: 0.12
