# Import modules

In [1]:
import numpy as np
import pandas as pd 
import scipy
from sklearn.linear_model import LinearRegression

## VIDEO 4

### Read in data

In [13]:
# Load the training data, then print a quick overview:
# first rows, column dtypes, summary statistics and (rows, columns).
wine = pd.read_csv("wine.csv")
for overview in (wine.head(), wine.dtypes, wine.describe(), wine.shape):
    print(overview)

   Year   Price  WinterRain     AGST  HarvestRain  Age  FrancePop
0  1952  7.4950         600  17.1167          160   31  43183.569
1  1953  8.0393         690  16.7333           80   30  43495.030
2  1955  7.6858         502  17.1500          130   28  44217.857
3  1957  6.9845         420  16.1333          110   26  45152.252
4  1958  6.7772         582  16.4167          187   25  45653.805

[5 rows x 7 columns]
Year             int64
Price          float64
WinterRain       int64
AGST           float64
HarvestRain      int64
Age              int64
FrancePop      float64
dtype: object
              Year      Price  WinterRain       AGST  HarvestRain        Age  \
count    25.000000  25.000000   25.000000  25.000000    25.000000  25.000000   
mean   1965.800000   7.067224  605.280000  16.509336   148.560000  17.200000   
std       7.691987   0.650341  132.277965   0.675397    74.419464   7.691987   
min    1952.000000   6.204900  376.000000  14.983300    38.000000   5.000000   
25%    

### Linear Regression (one variable)

In [9]:
# Fit a linear regression of Price on a single feature (AGST).
feature_cols = ['AGST']
X = wine[feature_cols]
y = wine.Price
model1 = LinearRegression()
model1.fit(X, y)

# Fitted intercept and per-feature coefficients (same order as feature_cols).
print(model1.intercept_)
print(model1.coef_)

# R-squared (coefficient of determination) on the training data.
# Note: LinearRegression.score returns R^2, not the sum of squared errors.
print(model1.score(X, y))

-3.41776131349
[ 0.63509431]
0.4350231678


### Linear Regression (two variables)

In [22]:
# Fit a linear regression of Price on two features (AGST, HarvestRain).
feature_cols = ['AGST', 'HarvestRain']
X = wine[feature_cols]
y = wine.Price
model2 = LinearRegression()
model2.fit(X, y)

# Fitted intercept and per-feature coefficients (same order as feature_cols).
print(model2.intercept_)
print(model2.coef_)

# R-squared (coefficient of determination) on the training data.
# Note: LinearRegression.score returns R^2, not the sum of squared errors.
print(model2.score(X, y))

-2.20265360095
[ 0.60261691 -0.00457006]
0.707370766205


### Linear Regression (all variables)

In [23]:
# Fit a linear regression of Price on all five candidate features.
feature_cols = ['AGST', 'HarvestRain', 'WinterRain', 'Age', 'FrancePop']
X = wine[feature_cols]
y = wine.Price
model3 = LinearRegression()
model3.fit(X, y)

# Fitted intercept and per-feature coefficients (same order as feature_cols).
print(model3.intercept_)
print(model3.coef_)

# R-squared (coefficient of determination) on the training data.
# Note: LinearRegression.score returns R^2, not the sum of squared errors.
print(model3.score(X, y))

-0.450398864395
[  6.01223884e-01  -3.95812450e-03   1.04250681e-03   5.84748489e-04
  -4.95273038e-05]
0.82935922233


## VIDEO 5

### Remove FrancePop

In [24]:
# Refit without FrancePop: Price on AGST, HarvestRain, WinterRain and Age.
feature_cols = ['AGST', 'HarvestRain', 'WinterRain', 'Age']
X = wine[feature_cols]
y = wine.Price
model4 = LinearRegression()
model4.fit(X, y)

# Fitted intercept and per-feature coefficients (same order as feature_cols).
print(model4.intercept_)
print(model4.coef_)

# R-squared (coefficient of determination) on the training data.
# Note: LinearRegression.score returns R^2, not the sum of squared errors.
print(model4.score(X, y))

-3.42998018693
[ 0.60720935 -0.00397153  0.00107551  0.02393083]
0.828566219342


## VIDEO 6

### Correlations


In [30]:
# Compute the pairwise correlation matrix once and reuse it below.
corr_matrix = wine.corr()

# Label-based lookup with .loc (the .ix indexer was removed in pandas 1.0).
print(corr_matrix.loc['WinterRain', 'Price'])
print(corr_matrix.loc['Age', 'FrancePop'])

# Alternative: Series.corr between two columns, with a selectable method.
print(wine['Age'].corr(wine['FrancePop']))  # default method 'pearson'
print(wine['Age'].corr(wine['FrancePop'], method='spearman'))
print(wine['Age'].corr(wine['FrancePop'], method='kendall'))

# Last expression of the cell: display the full correlation matrix.
corr_matrix

0.136650547388
-0.994485097111
-0.994485097111
-1.0
-1.0


Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
Year,1.0,-0.447768,0.01697,-0.246916,0.028009,-1.0,0.994485
Price,-0.447768,1.0,0.136651,0.659563,-0.563322,0.447768,-0.466862
WinterRain,0.01697,0.136651,1.0,-0.321091,-0.275441,-0.01697,-0.001622
AGST,-0.246916,0.659563,-0.321091,1.0,-0.064496,0.246916,-0.259162
HarvestRain,0.028009,-0.563322,-0.275441,-0.064496,1.0,-0.028009,0.041264
Age,-1.0,0.447768,-0.01697,0.246916,-0.028009,1.0,-0.994485
FrancePop,0.994485,-0.466862,-0.001622,-0.259162,0.041264,-0.994485,1.0


### Remove Age and FrancePop
Equivalent R code:

    model5 = lm(Price ~ AGST + HarvestRain + WinterRain, data=wine)
    summary(model5)

In [25]:
# Refit without Age and FrancePop: Price on AGST, HarvestRain and WinterRain.
feature_cols = ['AGST', 'HarvestRain', 'WinterRain']
X = wine[feature_cols]
y = wine.Price
model5 = LinearRegression()
model5.fit(X, y)

# Fitted intercept and per-feature coefficients (same order as feature_cols).
print(model5.intercept_)
print(model5.coef_)

# R-squared (coefficient of determination) on the training data.
# Note: LinearRegression.score returns R^2, not the sum of squared errors.
print(model5.score(X, y))

-4.30162626004
[ 0.68102417 -0.00394815  0.00117653]
0.753689442638


## VIDEO 7

### Read in test set


In [55]:
# Load the held-out test data, then print a quick overview:
# first rows, column dtypes, summary statistics and (rows, columns).
wineTest = pd.read_csv("wine_test.csv")
for overview in (wineTest.head(), wineTest.dtypes, wineTest.describe(), wineTest.shape):
    print(overview)

   Year   Price  WinterRain     AGST  HarvestRain  Age  FrancePop
0  1979  6.9541         717  16.1667          122    4  54835.832
1  1980  6.4979         578  16.0000           74    3  55110.236

[2 rows x 7 columns]
Year             int64
Price          float64
WinterRain       int64
AGST           float64
HarvestRain      int64
Age              int64
FrancePop      float64
dtype: object
              Year     Price  WinterRain       AGST  HarvestRain       Age  \
count     2.000000  2.000000    2.000000   2.000000     2.000000  2.000000   
mean   1979.500000  6.726000  647.500000  16.083350    98.000000  3.500000   
std       0.707107  0.322582   98.287843   0.117875    33.941125  0.707107   
min    1979.000000  6.497900  578.000000  16.000000    74.000000  3.000000   
25%    1979.250000  6.611950  612.750000  16.041675    86.000000  3.250000   
50%    1979.500000  6.726000  647.500000  16.083350    98.000000  3.500000   
75%    1979.750000  6.840050  682.250000  16.125025   110.0

### Make test set predictions


In [62]:
# Predict test-set prices with model4, using the same four features
# it was fitted on.
feature_cols = ['AGST', 'HarvestRain', 'WinterRain', 'Age']
Test_X = wineTest[feature_cols]
Test_y = wineTest.Price

predictions = model4.predict(Test_X)

# Pair predictions with targets by position; indexing Test_y[i] would be a
# label lookup and only works when the index happens to be 0..n-1.
for prediction, target in zip(predictions, Test_y.values):
    print('Predicted: %s, Target: %s' % (prediction, target))
    

Predicted: 6.76892462968, Target: 6.9541
Predicted: 6.68491040269, Target: 6.4979



### Compute R-squared

In [63]:
# R-squared of model4 evaluated on the held-out test set.
print('R-squared: %.2f' % model4.score(Test_X, Test_y))

R-squared: 0.33
