# LASSO Regression Example
## Created by Anton Lipkanou
### For private use only

In [3]:
import matplotlib.pyplot as plt
import pandas
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.linear_model import LassoLarsCV

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Load the modelling dataset. The CSV is read from the current working
# directory; per the original note, the file can be found in the GitHub repo.
alldata = pd.read_csv('finalmaster-ratios.csv')
allvariablenames = list(alldata.columns.values)

# Take an independent copy rather than binding a second name to the same list
# object, so in-place edits to the predictor list (e.g. `pop`) leave
# `allvariablenames` intact.
listofallpredictors = list(allvariablenames)



In [4]:
# Show the current contents of the predictor-name list.
print(listofallpredictors)

['# Purchases', 'B01001001', 'B01001002', 'B01001003', 'B01001004', 'B01001005', 'B01001006', 'B01001007', 'B01001008', 'B01001009', 'B01001010', 'B01001011', 'B01001012', 'B01001013', 'B01001014', 'B01001015', 'B01001016', 'B01001017', 'B01001018', 'B01001019', 'B01001020', 'B01001021', 'B01001022', 'B01001023', 'B01001024', 'B01001025', 'B01001026', 'B01001027', 'B01001028', 'B01001029', 'B01001030', 'B01001031', 'B01001032', 'B01001033', 'B01001034', 'B01001035', 'B01001036', 'B01001037', 'B01001038', 'B01001039', 'B01001040', 'B01001041', 'B01001042', 'B01001043', 'B01001044', 'B01001045', 'B01001046', 'B01001047', 'B01001048', 'B01001049', 'B02001001', 'B02001002', 'B02001003', 'B02001004', 'B02001005', 'B02001006', 'B02001007', 'B02001008', 'B02001009', 'B02001010', 'B12001001', 'B12001002', 'B12001003', 'B12001004', 'B12001005', 'B12001006', 'B12001007', 'B12001008', 'B12001009', 'B12001010', 'B12001011', 'B12001012', 'B12001013', 'B12001014', 'B12001015', 'B12001016', 'B1200101

In [5]:
# The first 8 columns of the CSV are not used as predictors; the first of them
# is the '# Purchases' target.
# Slicing the frame's own column index -- instead of popping entries off the
# existing list in place -- keeps this cell idempotent: re-running it no longer
# drops another 8 columns each time.
N_LEADING_NON_PREDICTORS = 8
listofallpredictors = list(alldata.columns.values)[N_LEADING_NON_PREDICTORS:]

# load predictors into dataframe
predictors = alldata[listofallpredictors]

# load target into dataframe
target = alldata['# Purchases']

# split data into train and test sets, with 30% retained for test;
# the fixed random_state makes the split reproducible across runs
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, target, test_size=.3, random_state=123)

# LASSO fitted with the LARS algorithm; the regularisation strength is chosen
# by 10-fold cross-validation on the training split only
model = LassoLarsCV(precompute=False, cv=10)
model.fit(pred_train, tar_train)

# Coefficient table: one row per predictor, pairing its column name ('label')
# with the coefficient ('coeff') taken from the fitted model.
predictors_model = pd.DataFrame(listofallpredictors, columns=['label'])
predictors_model['coeff'] = model.coef_

# Print each predictor whose coefficient is strictly positive.
# NOTE(review): predictors with a negative (non-zero) coefficient are not
# printed by this filter -- confirm that is intended.
positive_rows = predictors_model[predictors_model['coeff'] > 0]
for _idx, row in positive_rows.iterrows():
    print(row.values)

# Hand-written interpretation of the coefficients printed above.
# NOTE(review): these sentences are fixed text and are not derived from
# `model`; they must be revisited whenever the data or the fit changes.
print("Question/Answer:")
print("\n".join((
    "In the areas where there are more males from 40 to 44 years, we sell a bit more Bobo Bars",
    "In the areas where there are more females from 30 to 34 years, we sell more Bobo Bars",
    "In the areas where there are more females from 35 to 39 years, we sell a bit more Bobo Bars",
    "In the areas where there are more females from 40 to 44 years, we sell more Bobo Bars",
    "In the areas where there are more Asians, we sell a bit more Bobo Bars",
    "In the areas where there are more people with BAchelors degree, we sell a bit more Bobo Bars",
    "In the areas where there are more peopel with the Professional or Grad degree, we sell a bit more Bobo Bars",
    "In the areas where there are women from 15 to 50, we sell tremendously more Bobo Bars",
    "In the areas where there are people with the HHI of $200k+, we sell more Bobo Bars",
)))

# Closing summary of the answer, split over two output lines.
print("\n".join((
    "Most important predictors are the presence of women from 15 to 50 in",
    "the area and the quantity of women within the 30 to 34 age range",
)))



['B01001014' 0.8557908775529921]
['B01001036' 2.505392496591849]
['B01001037' 0.8894214357013622]
['B01001038' 1.5315839680821497]
['B02001005' 0.4125408937426837]
['B13014026' 0.4800240326923769]
['B13014027' 0.6977454940063235]
['B13016001' 874922971.7249781]
['B19001017' 1.4834465563617387]
Question/Answer:
In the areas where there are more males from 40 to 44 years, we sell a bit more Bobo Bars
In the areas where there are more females from 30 to 34 years, we sell more Bobo Bars
In the areas where there are more females from 35 to 39 years, we sell a bit more Bobo Bars
In the areas where there are more females from 40 to 44 years, we sell more Bobo Bars
In the areas where there are more Asians, we sell a bit more Bobo Bars
In the areas where there are more people with BAchelors degree, we sell a bit more Bobo Bars
In the areas where there are more peopel with the Professional or Grad degree, we sell a bit more Bobo Bars
In the areas where there are women from 15 to 50, we sell trem



In [6]:
# Evaluate the already-fitted model: mean squared error of its predictions on
# the training split, then on the held-out test split.
train_predictions = model.predict(pred_train)
train_error = sklearn.metrics.mean_squared_error(tar_train, train_predictions)
print('training data MSE')
print(train_error)

test_predictions = model.predict(pred_test)
test_error = sklearn.metrics.mean_squared_error(tar_test, test_predictions)
print('test data MSE')
print(test_error)

# Hand-written commentary on the two numbers above (fixed text, not computed).
print("\n".join((
    "Question/Answer",
    "MSE on the test set is much bigger than on the train set",
    "It might mean that the model basically overfits",
)))

training data MSE
22025.312777378716
test data MSE
41549.12573000182
Question/Answer
MSE on the test set is much bigger than on the train set
It might mean that the model basically overfits


In [7]:
# R-squared of the fitted model on the training split
# (`score` is called with the training predictors and training target).
rsquared_train = model.score(pred_train, tar_train)
print('training data R-square\n' + str(rsquared_train))

training data R-square
0.24002827375880997


In [8]:
# R-squared of the fitted model on the held-out test split
# (`score` is called with the test predictors and test target).
rsquared_test = model.score(pred_test, tar_test)
print('test data R-square\n' + str(rsquared_test))

test data R-square
0.17587122769388464


In [9]:
# Hand-written commentary on predictive quality.
# NOTE(review): the 17.6% figure is typed in by hand rather than read from
# `rsquared_test`; it will go stale if the data or the split changes.
print("\n".join((
    "Question/Answer",
    "In general, the model predicts not very well. The R-squared of 17.6%",
    "on the test set should be considered as pretty low",
)))

Question/Answer
In general, the model predicts not very well. The R-squared of 17.6%
on the test set should be considered as pretty low


In [10]:
# Report the intercept of the fitted model, followed by a hand-written
# interpretation.
# NOTE(review): the 22.2 / "roughly 22" figures in the text are typed in by
# hand rather than read from `model.intercept_`.
print("y interecept:")
print(model.intercept_)
print("\n".join((
    "Question/Answer",
    "With the intercept of 22.2, we can expect the annual sales in the ",
    "average area without the high presence of the groups discussed above",
    "to be at the leevl of roughly 22 per year.",
)))

y interecept:
22.194697684317433
Question/Answer
With the intercept of 22.2, we can expect the annual sales in the 
average area without the high presence of the groups discussed above
to be at the leevl of roughly 22 per year.
