# Variables categóricas en una regresión lineal

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("python-ml-course-master/datasets/ecom-expense/Ecom Expense.csv")

In [3]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [4]:
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender")
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City")

In [5]:
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [6]:
dummy_city_tier.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [7]:
df = df.join(dummy_gender)

In [8]:
df = df.join(dummy_city_tier)

In [9]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [10]:
feature_cols = ["Monthly Income","Transaction Time", 
                "Gender_Female", "Gender_Male", 
                "City_Tier 1", "City_Tier 2", "City_Tier 3"]

In [11]:
X = df[feature_cols]

In [12]:
Y = df["Total Spend"]

In [13]:
lm = LinearRegression()

In [14]:
lm.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
lm.intercept_

3655.7294076906533

In [16]:
lm.coef_

array([   0.15297825,    0.12372609,  -94.15779883,   94.15779883,
        119.6632516 ,  -16.67901801, -102.9842336 ])

In [17]:
list(zip(feature_cols,lm.coef_))

[('Monthly Income', 0.15297824609320512),
 ('Transaction Time', 0.12372608642619992),
 ('Gender_Female', -94.15779883032016),
 ('Gender_Male', 94.15779883032023),
 ('City_Tier 1', 119.66325160390109),
 ('City_Tier 2', -16.67901800799035),
 ('City_Tier 3', -102.98423359591068)]

In [18]:
lm.score(X,Y)
# Nos da un valor muy bajo.. agregamos otra de la variables

0.19478920552885381

In [19]:
feature_cols = ["Monthly Income","Transaction Time", 
                "Gender_Female", "Gender_Male", 
                "City_Tier 1", "City_Tier 2", "City_Tier 3",
               "Record"]

In [20]:
X = df[feature_cols]
Y = df["Total Spend"]
lm.fit(X,Y)
lm.score(X,Y)

0.9179923586131016

In [21]:
list(zip(feature_cols,lm.coef_))

[('Monthly Income', 0.14753898049205738),
 ('Transaction Time', 0.15494612549589634),
 ('Gender_Female', -131.02501325554624),
 ('Gender_Male', 131.02501325554607),
 ('City_Tier 1', 76.76432601049513),
 ('City_Tier 2', 55.1389743092325),
 ('City_Tier 3', -131.9033003197277),
 ('Record', 772.2334457445645)]

In [22]:
# Probamos agregando la edad
feature_cols = ["Monthly Income","Transaction Time", 
                "Gender_Female", "Gender_Male", 
                "City_Tier 1", "City_Tier 2", "City_Tier 3",
               "Record", "Age "]
X = df[feature_cols]
Y = df["Total Spend"]
lm.fit(X,Y)
lm.score(X,Y)

0.9187458997709432

In [23]:
list(zip(feature_cols,lm.coef_))

[('Monthly Income', 0.14744226897448584),
 ('Transaction Time', 0.15639158306366313),
 ('Gender_Female', -133.0887066317063),
 ('Gender_Male', 133.08870663170583),
 ('City_Tier 1', 78.37850497640308),
 ('City_Tier 2', 52.02596334431941),
 ('City_Tier 3', -130.40446832072254),
 ('Record', 772.1492053631358),
 ('Age ', 6.424298167612881)]

In [24]:
lm.intercept_

-335.73800174537246

In [25]:
# Nos quedamos con el anterior sin la edad
list(zip(feature_cols,lm.coef_))

[('Monthly Income', 0.14744226897448584),
 ('Transaction Time', 0.15639158306366313),
 ('Gender_Female', -133.0887066317063),
 ('Gender_Male', 133.08870663170583),
 ('City_Tier 1', 78.37850497640308),
 ('City_Tier 2', 52.02596334431941),
 ('City_Tier 3', -130.40446832072254),
 ('Record', 772.1492053631358),
 ('Age ', 6.424298167612881)]

# Con lo cual nuestro modelo podría ser escrito como:
Total Spend = -79.4171303013718 + 0.14753898049205738 Monthly Income + 0.15494612549589634 Transaction Time + -131.02501325554624 Gender_Female + 131.02501325554607 Gender_Male + 76.76432601049513 City_Tier 1 + 55.1389743092325 City_Tier 2 + -131.9033003197277 City_Tier 3 + 772.2334457445645 Record

In [26]:
df["prediction"] = -79.4171303013718 + 0.14753898049205738 * df["Monthly Income"] + 0.15494612549589634 * df["Transaction Time"] + -131.02501325554624 * df["Gender_Female"] + 131.02501325554607 * df["Gender_Male"] + 76.76432601049513 * df["City_Tier 1"] + 55.1389743092325 * df["City_Tier 2"] + -131.9033003197277 * df["City_Tier 3"] + 772.2334457445645 * df["Record"]

In [27]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335


In [28]:
df["predict_lm"] = lm.predict(df[feature_cols])

In [29]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction,predict_lm
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672,4916.525671
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826,4690.334781
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504,5200.539037
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996,8130.623235
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335,3704.958811


In [30]:
df.columns.values

array(['Transaction ID', 'Age ', ' Items ', 'Monthly Income',
       'Transaction Time', 'Record', 'Gender', 'City Tier', 'Total Spend',
       'Gender_Female', 'Gender_Male', 'City_Tier 1', 'City_Tier 2',
       'City_Tier 3', 'prediction', 'predict_lm'], dtype=object)

In [31]:
SSD = np.sum( (df["prediction"] - df["Total Spend"]) ** 2)

In [32]:
SSD

1517733985.340816

In [33]:
RSE = np.sqrt(SSD / (len(df) - len(feature_cols) - 1) )

In [34]:
RSE

803.302596656182

In [35]:
 error = RSE / df["Total Spend"].mean()

In [36]:
error
# un error del 13%~

0.13033905610324445

#  Enmascarado de variables categóricas redundantes

In [37]:
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender").iloc[:,1:]

In [38]:
dummy_gender.head()

Unnamed: 0,Gender_Male
0,0
1,0
2,1
3,0
4,0


In [39]:
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City").iloc[:,1:]

In [40]:
dummy_city_tier.head()

Unnamed: 0,City_Tier 2,City_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


# habria que cargar nuevamente el df y join con estos dos nuevos df
# pero soy vago y borro los _1

In [41]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction,predict_lm
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672,4916.525671
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826,4690.334781
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504,5200.539037
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996,8130.623235
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335,3704.958811


In [44]:
df.drop(["Gender_Female"], axis=1, inplace=True)

In [45]:
df.drop(["City_Tier 1"], axis=1, inplace=True)

In [46]:
feature_cols = ["Monthly Income","Transaction Time", 
                "Gender_Male", 
                "City_Tier 2", "City_Tier 3",
               "Record"]

In [47]:
X = df[feature_cols]
Y = df["Total Spend"]
lm.fit(X,Y)
lm.score(X,Y)

0.9179923586131016

In [48]:
list(zip(feature_cols,lm.coef_))

[('Monthly Income', 0.14753898049205744),
 ('Transaction Time', 0.1549461254959002),
 ('Gender_Male', 262.0500265110948),
 ('City_Tier 2', -21.62535170126276),
 ('City_Tier 3', -208.66762633022296),
 ('Record', 772.2334457445636)]

In [49]:
lm.intercept_

-133.67781754642238

Antes
* ('Monthly Income', 0.14753898049205738),
* ('Transaction Time', 0.15494612549589634),
* ('Gender_Female', -131.02501325554624),
* ('Gender_Male', 131.02501325554607),
* ('City_Tier 1', 76.76432601049513),
* ('City_Tier 2', 55.1389743092325),
* ('City_Tier 3', -131.9033003197277),
* ('Record', 772.2334457445645)]

# Los únicos que cambian son los de las variables dummy
* Gender Female: antes -131.02 --> ahora 0
* Gender Male: antes 131.02 --> ahora 131.02 - (-131.02) : 262.05
* City_Tier 1: Antes 76.76 --> ahora 0
* City_Tier 2: Antes 55.13 --> ahora 55.13 - 76.76 : -21.62
* City_Tier 3: Antes  -131.90 --> ahora  -131.90 - (- 76.76) :  -208.66