### Categorical Features

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('../datasets/ecom-expense/Ecom Expense.csv')
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [3]:
# Gender y City Tier tienen que convertirse en variables categóricas
dummy_gender = pd.get_dummies(df['Gender'],prefix='Gender',prefix_sep='_')
dummy_city_tier = pd.get_dummies(df['City Tier'],prefix='City',prefix_sep='_')

In [4]:
dummy_city_tier.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [5]:
df = df.join(dummy_gender)
df = df.join(dummy_city_tier)
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [6]:
# Vamos a escoger estas en principio
# Si queremos añadir alguna más podemos hacerlo como en vídeos anteriores

features_cols = ['Monthly Income','Transaction Time','Gender_Female','Gender_Male','City_Tier 1','City_Tier 2','City_Tier 3']

In [7]:
# Features and Response
X = df[features_cols]
y = df['Total Spend']

In [8]:
# Instanciamos y Entrenamos Modelo
linreg = LinearRegression()
linreg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Intercept and Features Coefficients
print('Intercept: %s'%(linreg.intercept_))
print('Coefficients: %s'%(linreg.coef_))

Intercept: 3655.729407690654
Coefficients: [   0.15297825    0.12372609  -94.15779883   94.15779883  119.6632516
  -16.67901801 -102.9842336 ]


In [10]:
list(zip(features_cols,linreg.coef_))

[('Monthly Income', 0.1529782460932051),
 ('Transaction Time', 0.1237260864262002),
 ('Gender_Female', -94.15779883032023),
 ('Gender_Male', 94.15779883032025),
 ('City_Tier 1', 119.66325160390122),
 ('City_Tier 2', -16.679018007990468),
 ('City_Tier 3', -102.98423359591075)]

In [11]:
# Muy bajo
linreg.score(X,y)

0.19478920552885381

In [12]:
# Añado Record al modelo (Record es el número de transacciones al mes)
features_cols = ['Monthly Income','Transaction Time','Record','Gender_Female','Gender_Male','City_Tier 1','City_Tier 2','City_Tier 3']
# Features and Response
X = df[features_cols]
y = df['Total Spend']
# Instanciamos y Entrenamos Modelo
linreg = LinearRegression()
linreg.fit(X,y)
print(list(zip(features_cols,linreg.coef_)))
# R^2-Score
linreg.score(X,y)

[('Monthly Income', 0.14753898049205733), ('Transaction Time', 0.1549461254958988), ('Record', 772.233445744565), ('Gender_Female', -131.0250132555464), ('Gender_Male', 131.0250132555464), ('City_Tier 1', 76.76432601049525), ('City_Tier 2', 55.1389743092325), ('City_Tier 3', -131.9033003197277)]


0.9179923586131016

__R^2 ha mejorado mucho pasando de 0.1947 a 0.9179__

In [13]:
# Añado edad al modelo
features_cols = ['Monthly Income','Transaction Time','Age ','Record','Gender_Female','Gender_Male','City_Tier 1','City_Tier 2','City_Tier 3']
# Features and Response
X = df[features_cols]
y = df['Total Spend']
# Instanciamos y Entrenamos Modelo
linreg = LinearRegression()
linreg.fit(X,y)
print(list(zip(features_cols,linreg.coef_)))
# R^2-Score
linreg.score(X,y)

[('Monthly Income', 0.14744226897448595), ('Transaction Time', 0.15639158306366374), ('Age ', 6.424298167612837), ('Record', 772.1492053631354), ('Gender_Female', -133.08870663170674), ('Gender_Male', 133.0887066317067), ('City_Tier 1', 78.37850497640319), ('City_Tier 2', 52.02596334431932), ('City_Tier 3', -130.4044683207225)]


0.9187458997709432

__Apenas ha subido el R^2__

In [14]:
df.columns.values

array(['Transaction ID', 'Age ', ' Items ', 'Monthly Income',
       'Transaction Time', 'Record', 'Gender', 'City Tier', 'Total Spend',
       'Gender_Female', 'Gender_Male', 'City_Tier 1', 'City_Tier 2',
       'City_Tier 3'], dtype=object)

In [23]:
# Añado Record al modelo y quito edad
features_cols = ['Monthly Income','Transaction Time','Record','Gender_Female','Gender_Male','City_Tier 1','City_Tier 2','City_Tier 3']
# Features and Response
X = df[features_cols]
y = df['Total Spend']
# Instanciamos y Entrenamos Modelo
linreg = LinearRegression()
linreg.fit(X,y)
print(list(zip(features_cols,linreg.coef_)))
# R^2-Score
linreg.score(X,y)

[('Monthly Income', 0.14753898049205733), ('Transaction Time', 0.1549461254958988), ('Record', 772.233445744565), ('Gender_Female', -131.0250132555464), ('Gender_Male', 131.0250132555464), ('City_Tier 1', 76.76432601049525), ('City_Tier 2', 55.1389743092325), ('City_Tier 3', -131.9033003197277)]


0.9179923586131016

In [24]:
spend_pred = linreg.predict(X)

In [25]:
df['Prediction'] = spend_pred

In [26]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,Prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335


In [27]:
SSD = np.sum((df['Total Spend']-df['Prediction'])**2)
SSD

1517733985.3408158

In [28]:
RSE = np.sqrt(SSD/(len(df)-len(features_cols)-1))
RSE

803.1318809818164

In [29]:
spend_mean = np.mean(df['Total Spend'])
spend_mean

6163.176415976714

In [30]:
error = RSE/spend_mean
error

0.1303113568029416

# Variables categoricas con k categorias se pueden explicar como k - 1 columnas

El cambio repercutirá en el INTERCEPT

In [31]:
# Quito una de Genre y uno de Citi Tier
features_cols = ['Monthly Income','Transaction Time','Record','Gender_Male','City_Tier 1','City_Tier 2']
# Features and Response
X = df[features_cols]
y = df['Total Spend']
# Instanciamos y Entrenamos Modelo
linreg = LinearRegression()
linreg.fit(X,y)
print(list(zip(features_cols,linreg.coef_)))
# R^2-Score
linreg.score(X,y)

[('Monthly Income', 0.14753898049205735), ('Transaction Time', 0.15494612549589304), ('Record', 772.2334457445638), ('Gender_Male', 262.0500265110924), ('City_Tier 1', 208.66762633022293), ('City_Tier 2', 187.04227462896034)]


0.9179923586131016

In [32]:
SSD = np.sum((df['Total Spend']-df['Prediction'])**2)
SSD

1517733985.3408158

In [33]:
RSE = np.sqrt(SSD/(len(df)-len(features_cols)-1))
RSE

802.7907758991203

In [34]:
spend_mean = np.mean(df['Total Spend'])
spend_mean

6163.176415976714

In [35]:
error = RSE/spend_mean
error

0.13025601114030375