# Linear Regression Predictions

In [170]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [171]:
fb = pd.read_csv('dataset_facebook.csv', delimiter=';')

In [172]:
fb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
Page total likes                                                       500 non-null int64
Type                                                                   500 non-null object
Category                                                               500 non-null int64
Post Month                                                             500 non-null int64
Post Weekday                                                           500 non-null int64
Post Hour                                                              500 non-null int64
Paid                                                                   499 non-null float64
Lifetime Post Total Reach                                              500 non-null int64
Lifetime Post Total Impressions                                        500 non-null int64
Lifetime Engaged Users                                                 500 non-nul

In [173]:
fb.columns = fb.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [174]:
fb.describe()

Unnamed: 0,page_total_likes,category,post_month,post_weekday,post_hour,paid,lifetime_post_total_reach,lifetime_post_total_impressions,lifetime_engaged_users,lifetime_post_consumers,lifetime_post_consumptions,lifetime_post_impressions_by_people_who_have_liked_your_page,lifetime_post_reach_by_people_who_like_your_page,lifetime_people_who_have_liked_your_page_and_engaged_with_your_post,comment,like,share,total_interactions
count,500.0,500.0,500.0,500.0,500.0,499.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,499.0,496.0,500.0
mean,123194.176,1.88,7.038,4.15,7.84,0.278557,13903.36,29585.95,920.344,798.772,1415.13,16766.38,6585.488,609.986,7.482,177.945892,27.266129,212.12
std,16272.813214,0.852675,3.307936,2.030701,4.368589,0.448739,22740.78789,76803.25,985.016636,882.505013,2000.594118,59791.02,7682.009405,612.725618,21.18091,323.398742,42.613292,380.233118
min,81370.0,1.0,1.0,1.0,1.0,0.0,238.0,570.0,9.0,9.0,9.0,567.0,236.0,9.0,0.0,0.0,0.0,0.0
25%,112676.0,1.0,4.0,2.0,3.0,0.0,3315.0,5694.75,393.75,332.5,509.25,3969.75,2181.5,291.0,1.0,56.5,10.0,71.0
50%,129600.0,2.0,7.0,4.0,9.0,0.0,5281.0,9051.0,625.5,551.5,851.0,6255.5,3417.0,412.0,3.0,101.0,19.0,123.5
75%,136393.0,3.0,10.0,6.0,11.0,1.0,13168.0,22085.5,1062.0,955.5,1463.0,14860.5,7989.0,656.25,7.0,187.5,32.25,228.5
max,139441.0,3.0,12.0,7.0,23.0,1.0,180480.0,1110282.0,11452.0,11328.0,19779.0,1107833.0,51456.0,4376.0,372.0,5172.0,790.0,6334.0


In [175]:
formula = 'share ~ like + comment'

linear_model = smf.ols(formula=formula, data=fb).fit()
linear_model.summary()

0,1,2,3
Dep. Variable:,share,R-squared:,0.859
Model:,OLS,Adj. R-squared:,0.858
Method:,Least Squares,F-statistic:,1496.0
Date:,"Fri, 22 Nov 2019",Prob (F-statistic):,4.0400000000000005e-210
Time:,14:53:14,Log-Likelihood:,-2079.3
No. Observations:,496,AIC:,4165.0
Df Residuals:,493,BIC:,4177.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.7032,0.836,9.214,0.000,6.060,9.346
like,0.0779,0.004,19.092,0.000,0.070,0.086
comment,0.7462,0.062,12.000,0.000,0.624,0.868

0,1,2,3
Omnibus:,95.127,Durbin-Watson:,1.696
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1809.729
Skew:,0.007,Prob(JB):,0.0
Kurtosis:,12.358,Cond. No.,430.0


## One point

In [176]:
indices = np.random.random_integers(0,len(fb),1)
test_data = fb.iloc[indices].drop('share', axis=1)
test_data

  """Entry point for launching an IPython kernel.


Unnamed: 0,page_total_likes,type,category,post_month,post_weekday,post_hour,paid,lifetime_post_total_reach,lifetime_post_total_impressions,lifetime_engaged_users,lifetime_post_consumers,lifetime_post_consumptions,lifetime_post_impressions_by_people_who_have_liked_your_page,lifetime_post_reach_by_people_who_like_your_page,lifetime_people_who_have_liked_your_page_and_engaged_with_your_post,comment,like,total_interactions
395,107907,Photo,1,4,6,9,0.0,3544,6156,509,482,701,5011,2746,387,1,51.0,58


In [177]:
linear_model.predict(test_data)

395    12.420345
dtype: float64

### And this is how it's doing it

prediction = intercept + coef_like * like + coef_comment * comment

ie.  y = b0 + b1 * x1 + b2 * x2

In [178]:
coefs = linear_model.params
coefs

Intercept    7.703172
like         0.077862
comment      0.746209
dtype: float64

In [181]:
prediction = coefs[0]+ coefs[1]*test_data['like'] + coefs[2]*test_data['comment']
prediction

395    12.420345
dtype: float64

# Multiple points!

In [183]:
# grab some random data points
# IF WE HAD NEW DATA, WE'D USE THAT INSTEAD
indices = np.random.random_integers(0,len(fb),10)
test_data = fb.iloc[indices].drop('share', axis=1)
test_data

  """Entry point for launching an IPython kernel.


Unnamed: 0,page_total_likes,type,category,post_month,post_weekday,post_hour,paid,lifetime_post_total_reach,lifetime_post_total_impressions,lifetime_engaged_users,lifetime_post_consumers,lifetime_post_consumptions,lifetime_post_impressions_by_people_who_have_liked_your_page,lifetime_post_reach_by_people_who_like_your_page,lifetime_people_who_have_liked_your_page_and_engaged_with_your_post,comment,like,total_interactions
308,124940,Photo,3,6,2,13,0.0,22464,29650,1029,934,1408,9376,6864,546,3,142.0,163
403,107907,Link,1,4,2,6,0.0,70912,94172,1374,1106,1267,42338,27232,788,1,379.0,420
197,133594,Photo,2,8,5,13,0.0,1920,3124,365,331,428,2541,1519,251,1,52.0,60
99,137020,Photo,1,10,4,10,0.0,12776,21893,785,539,881,13272,7800,575,12,328.0,430
228,131728,Photo,2,7,4,3,0.0,14824,21863,868,591,966,13498,8560,650,64,367.0,456
74,137893,Video,1,11,3,11,0.0,13544,30235,517,458,667,26622,11760,447,2,99.0,114
245,130791,Photo,1,7,2,13,0.0,44464,66824,1052,930,1571,22904,14080,559,4,154.0,188
383,109670,Photo,3,4,6,3,0.0,4986,8198,662,614,788,5685,3230,469,2,92.0,122
294,125612,Photo,1,6,2,13,0.0,3600,5807,691,656,913,3351,2110,436,0,72.0,93
445,96749,Photo,1,3,1,13,1.0,4412,8070,711,678,926,6937,3478,346,1,55.0,63


In [186]:
# predicted number of shares
predictions = linear_model.predict(test_data)
predictions

308    20.998209
403    37.959092
197    12.498207
99     42.196433
228    84.035946
74     16.903932
245    22.678763
383    16.358898
294    13.309238
445    12.731793
dtype: float64

In [187]:
test_data['share'] = predictions
test_data

Unnamed: 0,page_total_likes,type,category,post_month,post_weekday,post_hour,paid,lifetime_post_total_reach,lifetime_post_total_impressions,lifetime_engaged_users,lifetime_post_consumers,lifetime_post_consumptions,lifetime_post_impressions_by_people_who_have_liked_your_page,lifetime_post_reach_by_people_who_like_your_page,lifetime_people_who_have_liked_your_page_and_engaged_with_your_post,comment,like,total_interactions,share
308,124940,Photo,3,6,2,13,0.0,22464,29650,1029,934,1408,9376,6864,546,3,142.0,163,20.998209
403,107907,Link,1,4,2,6,0.0,70912,94172,1374,1106,1267,42338,27232,788,1,379.0,420,37.959092
197,133594,Photo,2,8,5,13,0.0,1920,3124,365,331,428,2541,1519,251,1,52.0,60,12.498207
99,137020,Photo,1,10,4,10,0.0,12776,21893,785,539,881,13272,7800,575,12,328.0,430,42.196433
228,131728,Photo,2,7,4,3,0.0,14824,21863,868,591,966,13498,8560,650,64,367.0,456,84.035946
74,137893,Video,1,11,3,11,0.0,13544,30235,517,458,667,26622,11760,447,2,99.0,114,16.903932
245,130791,Photo,1,7,2,13,0.0,44464,66824,1052,930,1571,22904,14080,559,4,154.0,188,22.678763
383,109670,Photo,3,4,6,3,0.0,4986,8198,662,614,788,5685,3230,469,2,92.0,122,16.358898
294,125612,Photo,1,6,2,13,0.0,3600,5807,691,656,913,3351,2110,436,0,72.0,93,13.309238
445,96749,Photo,1,3,1,13,1.0,4412,8070,711,678,926,6937,3478,346,1,55.0,63,12.731793


# Logistic Predictions

### >>>>Scroll down>>>

In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [150]:
bc = pd.read_csv('breast-cancer-wisconsin.data')

In [151]:
columns = ['Sample_code_number', 'Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape',
'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin',
'Normal_Nucleoli', 'Mitoses', 'Class']
columns = [title.lower() for title in columns[0:-1]]
columns
columns.append('Class')
bc.columns = columns
bc=bc.dropna()
bc['Class']=(bc['Class']/2-1).astype(int)

In [152]:
bc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 0 to 697
Data columns (total 11 columns):
sample_code_number             698 non-null int64
clump_thickness                698 non-null int64
uniformity_of_cell_size        698 non-null int64
uniformity_of_cell_shape       698 non-null int64
marginal_adhesion              698 non-null int64
single_epithelial_cell_size    698 non-null int64
bare_nuclei                    698 non-null object
bland_chromatin                698 non-null int64
normal_nucleoli                698 non-null int64
mitoses                        698 non-null int64
Class                          698 non-null int64
dtypes: int64(10), object(1)
memory usage: 65.4+ KB


In [153]:
bc.head(5)

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,0
1,1015425,3,1,1,1,2,2,3,1,1,0
2,1016277,6,8,8,1,3,4,3,7,1,0
3,1017023,4,1,1,3,2,1,3,1,1,0
4,1017122,8,10,10,8,7,10,9,7,1,1


In [154]:
bc.describe()

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bland_chromatin,normal_nucleoli,mitoses,Class
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,1071807.0,4.416905,3.137536,3.210602,2.809456,3.217765,3.438395,2.869628,1.590258,0.345272
std,617532.3,2.817673,3.052575,2.972867,2.856606,2.215408,2.440056,3.055004,1.716162,0.475798
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,870258.2,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


## Fit the regression model

In [155]:
fitted_model = smf.logit(formula='Class ~ clump_thickness + uniformity_of_cell_size', data=bc).fit()
fitted_model.summary()

Optimization terminated successfully.
         Current function value: 0.152005
         Iterations 9


0,1,2,3
Dep. Variable:,Class,No. Observations:,698.0
Model:,Logit,Df Residuals:,695.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 22 Nov 2019",Pseudo R-squ.:,0.7641
Time:,14:49:38,Log-Likelihood:,-106.1
converged:,True,LL-Null:,-449.84
,,LLR p-value:,5.188e-150

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-7.1461,0.601,-11.899,0.000,-8.323,-5.969
clump_thickness,0.6174,0.092,6.692,0.000,0.437,0.798
uniformity_of_cell_size,1.1737,0.123,9.536,0.000,0.932,1.415


## Predictions

### One data point

In [158]:
# grab some random data points
# IF WE HAD NEW DATA (NOT YET CLASSIFIED), WE'D USE THAT INSTEAD
indices = np.random.random_integers(0,len(bc),1)
test_data = bc.iloc[indices].drop('Class', axis=1)
test_data

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
199,1214966,9,7,7,5,5,10,7,8,3


In [159]:
prediction_probalities = fitted_model.predict(test_data)
prediction_probalities

199    0.998677
dtype: float64

In [160]:
theshold = 0.5
predictions = [pp >= theshold for pp in prediction_probalities]
predictions

[True]

In [161]:
test_data['cancer'] = predictions
test_data

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,cancer
199,1214966,9,7,7,5,5,10,7,8,3,True


And this is the equation it uses - 
https://www.saedsayad.com/images/LogReg_eq.png

### Multiple data points

In [163]:
# grab some random data points
# IF WE HAD NEW DATA (NOT YET CLASSIFIED), WE'D USE THAT INSTEAD
indices = np.random.random_integers(0,len(bc),10)
test_data = bc.iloc[indices].drop('Class', axis=1)
test_data

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
613,1031608,2,1,1,1,1,1,2,1,1
493,1155967,5,1,2,10,4,5,2,1,1
615,1042252,3,1,1,1,2,1,2,1,1
363,896404,2,1,1,1,2,1,3,1,1
341,814265,2,1,1,1,2,1,1,1,1
322,733823,5,4,6,10,2,10,4,1,1
553,1313658,3,1,1,1,2,1,1,1,1
79,1137156,2,2,2,1,1,1,7,1,1
33,1075123,3,1,2,1,2,1,2,1,1
83,1147699,3,5,7,8,8,9,7,10,7


In [164]:
prediction_probalities = fitted_model.predict(test_data)
prediction_probalities

613    0.008683
493    0.052878
615    0.015981
363    0.008683
341    0.008683
322    0.653751
553    0.015981
79     0.027545
33     0.015981
83     0.639786
dtype: float64

In [165]:
theshold = 0.5
predictions = [pp >= theshold for pp in prediction_probalities]
predictions

[False, False, False, False, False, True, False, False, False, True]

In [166]:
test_data['cancer'] = predictions
test_data

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,cancer
613,1031608,2,1,1,1,1,1,2,1,1,False
493,1155967,5,1,2,10,4,5,2,1,1,False
615,1042252,3,1,1,1,2,1,2,1,1,False
363,896404,2,1,1,1,2,1,3,1,1,False
341,814265,2,1,1,1,2,1,1,1,1,False
322,733823,5,4,6,10,2,10,4,1,1,True
553,1313658,3,1,1,1,2,1,1,1,1,False
79,1137156,2,2,2,1,1,1,7,1,1,False
33,1075123,3,1,2,1,2,1,2,1,1,False
83,1147699,3,5,7,8,8,9,7,10,7,True
