# El tratamiento de las variables categóricas

In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

In [4]:
# Load the "Ecom Expense" CSV into a DataFrame and preview its first rows.
df = pd.read_csv("../../datasets/ecom-expense/Ecom Expense.csv")
df.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


Entre las variables asociadas a una transacción bancaria tenemos dos variables no numéricas, es decir, dos variables **categóricas**: el sexo (`Gender`) y el nivel de la ciudad (`City Tier`).

### pd.get_dummies
Para tenerlas en cuenta en el modelo se usan variables *dummy*: variables que toman el valor 1 cuando la observación pertenece a una categoría y 0 en las demás. La función `get_dummies` de pandas, pasándole la columna de la variable categórica, la desdobla en tantas columnas como categorías haya y las rellena con 1 o 0.

In [11]:
# One-hot encode the two categorical columns, one indicator column per category.
# `dtype=int` pins the indicators to 0/1 integers: recent pandas releases
# default to booleans, which would display True/False instead of the 1/0
# values shown in this notebook's outputs.
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender", dtype=int)
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City", dtype=int)

In [12]:
# Preview the indicator columns generated for Gender.
dummy_gender.head(n=5)

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [7]:
# Preview the indicator columns generated for City Tier.
dummy_city_tier.head(n=5)

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [16]:
# Append both sets of indicator columns to the original frame, side by side.
df_new = pd.concat(
    [df, dummy_gender, dummy_city_tier],
    axis="columns",
    sort=False,
)
df_new.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


## Creación del modelo
### Preparación de los conjuntos de datos X e Y
Necesitamos:
- X: un DataFrame con las columnas predictoras
- Y: una Series con la columna a predecir

In [17]:
# List the exact column labels. Some carry stray whitespace (e.g. 'Age '),
# so later column selections must use the names exactly as shown here.
df_new.columns

Index(['Transaction ID', 'Age ', ' Items ', 'Monthly Income',
       'Transaction Time', 'Record', 'Gender', 'City Tier', 'Total Spend',
       'Gender_Female', 'Gender_Male', 'City_Tier 1', 'City_Tier 2',
       'City_Tier 3'],
      dtype='object')

In [18]:
# Predictor columns for the first model: two numeric variables plus the
# indicator columns for Gender and City Tier.
feature_cols = [
    'Monthly Income',
    'Transaction Time',
    'Gender_Female',
    'Gender_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
]

In [19]:
# Split the frame into the predictor matrix and the target column.
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]

### Preparación del modelo
$$Y = a_0 + a_1 x_1 + a_2 x_2 + \dots + a_n x_n$$
- Ajuste del modelo con `LinearRegression.fit`
- Extracción del corte con el eje ($a_0$) con `intercept_`
- Extracción del resto de coeficientes ($a_1, \dots, a_n$) con `coef_`

Para emparejar cada coeficiente con su columna podemos combinarlos con `zip`.

In [21]:
# Fit the linear model, then print the intercept (a0) followed by the
# per-feature coefficients (a1..an).
lm = LinearRegression().fit(X, Y)
print(lm.intercept_)
print(lm.coef_)

3655.7294076906537
[   0.15297825    0.12372609  -94.15779883   94.15779883  119.6632516
  -16.67901801 -102.9842336 ]


In [24]:
# Pair every feature name with the coefficient the model assigned to it.
[(column, weight) for column, weight in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.15297824609320512),
 ('Transaction Time', 0.1237260864262),
 ('Gender_Female', -94.15779883032016),
 ('Gender_Male', 94.15779883032022),
 ('City_Tier 1', 119.66325160390089),
 ('City_Tier 2', -16.679018007990205),
 ('City_Tier 3', -102.98423359591068)]

### Evaluación de la bondad del modelo
- Coeficiente de determinación $R^2$: cuanto más cerca de 1, mejor.


In [25]:
# R^2 of the fitted model, evaluated on the same X and Y it was fitted with.
lm.score(X, Y)

0.19478920552885381

### añadimos una variable: record
- Definimos nuevo X

In [27]:
# Same predictors as the first model, plus 'Record'.
feature_cols = [
    'Monthly Income',
    'Transaction Time',
    'Record',
    'Gender_Female',
    'Gender_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
]
X = df_new.loc[:, feature_cols]



- preparamos el modelo con linearregression.fit
- Comprobamos bondad

In [29]:
# Refit on the current X and report R^2 on the same data.
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

0.9179923586131016

### Añadimos edad
La columna se llama `'Age '`, con un espacio al final: conviene comprobar los nombres exactos en la lista de columnas (`df_new.columns`).
- definimos nuevo X
- preparamos el modelo con LinearRegression.fit 
- comprobamos bondad

In [34]:
# Add 'Age ' (the label has a trailing space) to the predictors, refit,
# and report R^2 on the same data.
feature_cols = [
    'Monthly Income',
    'Transaction Time',
    'Record',
    'Age ',
    'Gender_Female',
    'Gender_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
]
X = df_new.loc[:, feature_cols]
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

0.9187458997709432

In [None]:
# ### Drop Transaction Time
# - define the new X
# - fit the model with LinearRegression.fit
# - check the goodness of fit

In [35]:
# Drop 'Transaction Time' from the predictors, refit, and report R^2 on the
# same data.
feature_cols = [
    'Monthly Income',
    'Record',
    'Age ',
    'Gender_Female',
    'Gender_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
]
X = df_new.loc[:, feature_cols]
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

0.9184914879534127

In [36]:
# Print the intercept, then display each feature alongside its coefficient.
print(lm.intercept_)
[(column, weight) for column, weight in zip(feature_cols, lm.coef_)]

-252.92285444333083


[('Monthly Income', 0.1471926295577822),
 ('Record', 772.0952374800315),
 ('Age ', 6.404251233410378),
 ('Gender_Female', -134.82323175539614),
 ('Gender_Male', 134.82323175539602),
 ('City_Tier 1', 78.7493200419608),
 ('City_Tier 2', 51.6705421548436),
 ('City_Tier 3', -130.41986219680444)]

Nos quedamos con este modelo, con 8 variables predictoras:

Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 +
 'Gender_Female' * -134.82323175539614 +
 'Gender_Male' * 134.82323175539602 +
 'City_Tier 1' * 78.7493200419608 +
 'City_Tier 2' * 51.6705421548436 +
 'City_Tier 3' * -130.41986219680444

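 Entendiendo el modelo: como cada variable *dummy* vale 1 o 0, para cada combinación de sexo y nivel de ciudad el modelo se reduce a sumar las constantes correspondientes.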

 - Hombre de tier 1
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 + 134.82323175539602 + 78.7493200419608
- Hombre de tier 2
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 + 134.82323175539602 + 51.6705421548436
- Hombre del tier 3
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 + 134.82323175539602 -130.41986219680444

 - Mujer de tier 1
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 - 134.82323175539614 + 78.7493200419608
- Mujer de tier 2
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 - 134.82323175539614 + 51.6705421548436
- Mujer del tier 3
Total_Spend = -252.92285444333083+ 'Monthly Income' * 0.1471926295577822 +  'Record' * 772.0952374800315 +  'Age ' * 6.404251233410378 - 134.82323175539614 -130.41986219680444


## Chequeo del Modelo
Comprobamos el modelo a mano:
- Calculamos la columna predicha *con el modelo* mediante `lm.predict`
- Calculamos la suma de los cuadrados de las diferencias (SSD)
- Calculamos la desviación típica de los residuos (RSE)


In [57]:
# Manual cross-check (do not do this in real code): rebuild the prediction as
# intercept + sum(coef_i * x_i). The fitted parameters are read from the model
# instead of being hard-coded from an earlier output, so the check stays valid
# if the model is refit. `astype(float)` keeps the product numeric whatever
# dtype the indicator columns have.
df_new["prediction_caca"] = lm.intercept_ + df_new[feature_cols].astype(float).dot(lm.coef_)

# The correct approach: let the fitted model compute the predictions.
df_new["prediction"] = lm.predict(df_new[feature_cols])


In [58]:
# Preview the frame with the prediction columns added.
df_new.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction,RSE,prediction_caca
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4896.877673,698.492589,4896.877673
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4746.139795,611.163147,4746.139795
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5141.376825,-25.23763,5141.376825
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8146.600121,362.152445,8146.600121
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3717.684383,463.523898,3717.684383


In [60]:
# Per-row residual (prediction minus observed spend). NOTE: the column is
# labelled "RSE" but it holds raw residuals, not a residual standard error.
df_new["RSE"] = df_new["prediction"] - df_new["Total Spend"]

# Sum of squared differences, computed two ways as a cross-check. Both use
# vectorised reductions; Python's built-in sum() would loop over the Series
# element by element.
SSD1 = (df_new["RSE"] ** 2).sum()
SSD = np.sum((df_new["prediction"] - df_new["Total Spend"]) ** 2)
SSD, SSD1

(1508496485.6388445, 1508496485.6388407)

In [61]:
# los euros en los que nos equivocamos al predecir
RSE = np.sqrt(SSD/(len(df_new)-len(feature_cols)-1))
RSE

800.6840694194143

In [62]:
# Los euros que de media se gasta la peña
spend_mean = np.mean(df_new["Total Spend"])
spend_mean

6163.176415976715

In [63]:
# Express the residual standard error as a percentage of the mean spend.
error = (RSE / spend_mean) * 100
print(f"{error}%")

12.991418959610051%
