# Beautiful numbers - ML

## Load libraries and data

In [90]:
# pandas + numpy
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm


#sklearn
from sklearn.model_selection import train_test_split

# functions
import sys
# append the path of the parent directory
sys.path.append("..")
from functions import *



In [63]:
# prepare data
features = pd.read_csv('../data/beauty_metrics.csv', dtype={'str_n':str})
target = pd.read_csv('../data/venta_por_nr.csv')

# Tune the parameters of the model to improve the R²
features = features.drop(columns = ['n', 'str_n', 'is_odd', 'start_digit', 'repeat_sum', 'has_repeated_digits','repeat_max', 'repeat_digit_count', 'dist_digits_count', 'ends_00', 'starts_00', 'is_prime', 'starts_15'])

# get dummies 
features = pd.get_dummies(features, columns=['repeat_consec_max'], drop_first=True)

target = target['median']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.30, random_state=42)


In [56]:
features.columns

Index(['ends_0', 'ends_13', 'ends_15', 'ends_5', 'ends_69', 'ends_7',
       'ends_prime', 'odd_count', 'starts_0', 'starts_13',
       'has_repeated_digits', 'is_date', 'is_palindrome', 'is_postal_code',
       'is_series', 'repeat_sum_7', 'repeat_sum_9', 'repeat_sum_11',
       'repeat_sum_13', 'repeat_sum_17', 'repeat_sum_25'],
      dtype='object')

## Models

### Linear Regression

In [64]:
df = pd.concat([features, target], axis=1)
all_columns = "+".join(features.columns)
res = smf.glm(formula="median ~" + all_columns, data=df, family=sm.families.Binomial()).fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 median   No. Observations:               100000
Model:                            GLM   Df Residuals:                    99981
Model Family:                Binomial   Df Model:                           18
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -32703.
Date:                Mon, 21 Oct 2024   Deviance:                       14873.
Time:                        14:52:23   Pearson chi2:                 1.55e+04
No. Iterations:                     8   Pseudo R-squ. (CS):            0.09050
Covariance Type:            nonrobust                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [65]:
params = res.params 
conf = res.conf_int() 
conf['Odds Ratio'] = params 
conf.columns = ['5%', '95%', 'Odds Ratio'] 
print(np.exp(conf))

                                    5%        95%  Odds Ratio
Intercept                     5.067676   5.568984    5.312420
ends_0[T.True]                0.527331   0.584528    0.555193
ends_13[T.True]              16.303060  63.825848   32.257660
ends_15[T.True]               1.362263   2.469768    1.834250
ends_5[T.True]                2.336381   2.717654    2.519816
ends_69[T.True]               2.658794   4.402334    3.421242
ends_7[T.True]                2.504533   2.905461    2.697559
ends_prime[T.True]            0.918203   1.000766    0.958596
starts_0[T.True]              0.313749   0.348297    0.330572
starts_13[T.True]             1.738337   2.848853    2.225369
is_date[T.True]               1.173373   1.268816    1.220162
is_palindrome[T.True]         1.022679   1.476713    1.228903
is_postal_code[T.True]        1.143934   1.277847    1.209037
is_series[T.True]             0.365618  18.002158    2.565525
repeat_consec_max_2[T.True]   0.504302   0.540701    0.522184
repeat_c

In [66]:
pred = res.predict(features)

print("MAE: ", mean_absolute_error(pred, target))
print("RMSE: ", root_mean_squared_error(pred, target))
#print("R2 score", res.score(features, target))

# find the correlation between pred and target
print("R2: ", (np.corrcoef(pred, target)[1][0])**2)


MAE:  0.10762879292517351
RMSE:  0.15092569490307217
R2:  0.42302573459391696


### ML models

In [89]:
## sklearn
compare_models(X_train, y_train, X_test, y_test)

Decision Tree
MAE 0.10544065004930772
RMSE 0.14977252296692506
r22:  -0.22784834145641875
R2 score 0.44739450889117893 

Random Forest
MAE 0.10537794834736616
RMSE 0.14957560127248945
r22:  -0.22773109939911085
R2 score 0.4488466907594033 

Linear Regressor
MAE 0.11110038783225182
RMSE 0.1552732130639922
r22:  -0.5571637087198473
R2 score 0.40605807174008457 

KNN
MAE 0.11560561727198455
RMSE 0.16465746625759456
r22:  -0.3071130600238259
R2 score 0.3320964356197985 

Bagging Regressor
MAE 0.10687444212069029
RMSE 0.1506141895511086
r22:  -0.32436874562205253
R2 score 0.44116617756070087 

Gradient Boosting Regressor
MAE 0.10521859695012961
RMSE 0.14923866060070465
r22:  -0.2341503728515022
R2 score 0.4513269990439366 

AdaBoostRegressor
MAE 0.21196892263171818
RMSE 0.2473469974707134
r22:  -2.194404782670887
R2 score -0.5071767654214143 

xgb_reg
MAE 0.1054102512441464
RMSE 0.14970768900883283
r22:  -0.22780342029491663
R2 score 0.44787283223067076 

