## Importing Libraries

In [88]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)
from feature_engine.encoding import (
    OrdinalEncoder,
)
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib

## Importing Dataset

In [89]:
# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv("data.csv")

# Fill missing values of the target column with the column mean.
# NOTE(review): this mean is computed on the full dataset, i.e. before the
# train/test split below, so held-out rows influence the imputed targets.
# Consider dropping rows with a missing target instead -- confirm intent.
mean_imputer_target = MeanMedianImputer(
    imputation_method='mean', variables=['Monthly_expenses_$'])

mean_imputer_target.fit(df)
df = mean_imputer_target.transform(df)

# A bare expression is only rendered when it is the last line of a cell, so
# these two diagnostics are printed explicitly instead of being discarded.
print("Imputed value per variable:", mean_imputer_target.imputer_dict_)
print("Columns still containing NaN:",
      [var for var in df.columns if df[var].isnull().sum() > 0])

# Rename the two columns whose names contain '&' / '-' to the plain
# identifiers used by the rest of the notebook. Assigning the result (rather
# than inplace=True) keeps the cell a simple chain of re-bindings of `df`.
df = df.rename(columns={"Games_&_Hobbies": "Games_Hobbies",
                        "Cosmetics_&_Self-care": "Cosmetics_Self_Care",
                        })


In [90]:
# Hold out 10% of the rows for evaluation; every column except the target
# 'Monthly_expenses_$' is used as a feature. The split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Monthly_expenses_$']),
    df['Monthly_expenses_$'],
    test_size=0.1,
    random_state=0,
)

In [91]:
# Display the feature column names left after the target column was dropped.
X_train.columns

Index(['Gender', 'Age', 'Study_year', 'Living', 'Scholarship', 'Part_time_job',
       'Transporting', 'Smoking', 'Drinks', 'Games_Hobbies',
       'Cosmetics_Self_Care', 'Monthly_Subscription'],
      dtype='object')

## Config

In [92]:
# Column whose NaNs get a missing-indicator and mean imputation in the pipeline.
VAR_WITH_NA = ['Study_year']

# Categorical columns whose NaNs are filled with the most frequent value.
CAT_VARS_WITH_NA = [
    'Living',
    'Part_time_job',
    'Transporting',
    'Smoking',
    'Drinks',
    'Cosmetics_Self_Care',
    'Monthly_Subscription',
]

# All categorical columns handed to the ordinal encoder.
CAT_VARS = [
    'Gender',
    'Living',
    'Scholarship',
    'Part_time_job',
    'Transporting',
    'Smoking',
    'Drinks',
    'Games_Hobbies',
    'Cosmetics_Self_Care',
    'Monthly_Subscription',
]




In [93]:
# Full modelling pipeline: categorical imputation -> missing flag ->
# numerical imputation -> ordinal encoding -> XGBoost regressor.
expense_pipe = Pipeline(steps=[
    # Fill NaN in the listed categorical columns with the most frequent value.
    (
        'frequent_imputation',
        CategoricalImputer(
            variables=CAT_VARS_WITH_NA,
            imputation_method='frequent',
        ),
    ),
    # Add a missing-value indicator for the numerical column(s). This runs
    # before the mean imputation below, while the NaNs are still present.
    (
        'missing_indicator',
        AddMissingIndicator(variables=VAR_WITH_NA),
    ),
    # Replace NaN in the numerical column(s) with the mean.
    (
        'mean_imputation',
        MeanMedianImputer(
            variables=VAR_WITH_NA,
            imputation_method='mean',
        ),
    ),
    # Encode every categorical column using the 'ordered' method.
    (
        'categorical_encoder',
        OrdinalEncoder(
            variables=CAT_VARS,
            encoding_method='ordered',
        ),
    ),
    # Final estimator: XGBoost regressor with a fixed seed.
    (
        "XGBOOST_Regressor",
        xgb.XGBRegressor(random_state=42),
    ),
])


In [94]:
# Fit all preprocessing steps and the final regressor on the training split.
expense_pipe.fit(X_train, y_train)

In [95]:
# Names of the training-feature columns that contain at least one NaN.
X_train.columns[X_train.isnull().any()].tolist()

['Study_year',
 'Living',
 'Part_time_job',
 'Transporting',
 'Smoking',
 'Drinks',
 'Cosmetics_Self_Care',
 'Monthly_Subscription']

In [96]:
# Names of the test-feature columns that contain at least one NaN.
X_test.columns[X_test.isnull().any()].tolist()

['Study_year',
 'Part_time_job',
 'Transporting',
 'Smoking',
 'Drinks',
 'Cosmetics_Self_Care',
 'Monthly_Subscription']

In [97]:
# Predict the target for the held-out features using the fitted pipeline
# (preprocessing steps followed by the regressor).
predictions = expense_pipe.predict(X_test)

In [98]:
# R^2 of the pipeline's predictions against the held-out targets.
# `r2_score` is already imported in the notebook's top import cell, so it is
# not re-imported here (imports stay in one place).
r2_score(y_test, predictions)

0.9488825422408782

In [99]:
# Persist the fitted pipeline to 'expense_pipe.joblib' in the working directory.
joblib.dump(expense_pipe, 'expense_pipe.joblib') 

['expense_pipe.joblib']

In [100]:
# Reload the pipeline that was just written, to check the saved artefact works.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# from a trusted source (here, the file produced by the cell above).
xxx = joblib.load('expense_pipe.joblib')

In [101]:
# One hand-written record with the same columns (and order) as the training
# features, used to check that the reloaded pipeline can predict.
lorem = pd.DataFrame({
    'Gender': ['Female'],
    'Age': [21],
    'Study_year': [2],
    'Living': ['Home'],
    'Scholarship': ['No'],
    'Part_time_job': ['No'],
    'Transporting': ['No'],
    'Smoking': ['No'],
    'Drinks': ['No'],
    'Games_Hobbies': ['No'],
    'Cosmetics_Self_Care': ['Yes'],
    'Monthly_Subscription': ['No'],
})
xxx.predict(lorem)




array([150.00209], dtype=float32)