# Modeling

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")

from split_scale import split_my_data

## Exercise 1
Using the data on student grades from this lesson, complete the following:

0 Wrangle the data

In [2]:
# Read the raw student grade records from disk and preview the first rows.
grades_csv = "student_grades.csv"
df = pd.read_csv(grades_csv)
df.head()

Unnamed: 0,student_id,exam1,exam2,exam3,final_grade
0,1,100.0,90,95,96
1,2,98.0,93,96,95
2,3,85.0,83,87,87
3,4,83.0,80,86,85
4,5,93.0,90,96,97


In [3]:
# Summarize column dtypes and non-null counts to spot columns that need cleanup.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
student_id     104 non-null int64
exam1          103 non-null float64
exam2          104 non-null int64
exam3          104 non-null object
final_grade    104 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 4.2+ KB


In [4]:
# Count missing values per column.
df.isna().sum()

student_id     0
exam1          1
exam2          0
exam3          0
final_grade    0
dtype: int64

In [5]:
# exam3 was read as object dtype; list its distinct values to find the non-numeric entry.
df["exam3"].value_counts()

96    16
78    16
75    15
87     8
94     8
85     8
86     8
79     8
95     8
70     8
       1
Name: exam3, dtype: int64

In [6]:
# Remove surrounding whitespace so blank entries collapse to the empty string.
df["exam3"] = df["exam3"].str.strip()

In [7]:
# Number of exam3 entries that are empty strings after stripping.
df["exam3"].eq('').sum()

1

In [8]:
# Mark empty exam3 entries as missing so dropna() can remove them.
# Assign the result back to the column instead of calling replace(..., inplace=True)
# on `df.exam3`: that form is chained assignment, and under pandas Copy-on-Write it
# modifies a temporary Series and leaves `df` unchanged.
df['exam3'] = df['exam3'].replace('', np.nan)

In [9]:
# Keep only rows with no missing values, then cast every column to integer.
complete_rows = df.dropna()
df = complete_rows.astype('int')

In [11]:
# Re-check dtypes and non-null counts after dropping missing rows and casting to int.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 103
Data columns (total 5 columns):
student_id     102 non-null int64
exam1          102 non-null int64
exam2          102 non-null int64
exam3          102 non-null int64
final_grade    102 non-null int64
dtypes: int64(5)
memory usage: 4.8 KB


1 Split the data into train and test datasets.

In [12]:
from sklearn.model_selection import train_test_split

# 80% of rows go to train; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, random_state=123, train_size=.8)
train.head()


Unnamed: 0,student_id,exam1,exam2,exam3,final_grade
86,87,70,65,78,77
21,22,70,65,78,77
31,32,79,70,85,81
33,34,73,70,75,76
102,103,57,65,75,65


2 Create a model that uses exam 1 to predict the final grade.

In [13]:
# Single-feature model: exam1 -> final_grade, fit on the training split.
feature_cols, target_cols = ['exam1'], ['final_grade']
X, y = train[feature_cols], train[target_cols]
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# Build an actual-vs-predicted frame for the training rows.
# `y` was sliced out of `train`, so adding a column to it in place triggers
# SettingWithCopyWarning (hidden by the notebook-wide warnings filter).
# `.assign` returns a new frame instead, and `.ravel()` flattens the (n, 1)
# prediction array produced by a model fit on a 2-D target.
y = (
    y.rename(columns={'final_grade': 'actual'})
     .assign(predicted=lm.predict(X).ravel())
)
y.head()

Unnamed: 0,actual,predicted
86,77,75.34156
21,77,75.34156
31,81,82.203999
33,76,77.62904
102,65,65.42915


In [15]:
# Training-set mean squared error for the exam1-only model.
mse = mean_squared_error(y['actual'], y['predicted'])
mse

3.4059329666817035

3 Create a model that uses exam 2 to predict the final grade.

In [17]:
# Single-feature model: exam2 -> final_grade, fit on the training split.
feature_cols, target_cols = ['exam2'], ['final_grade']
X, y = train[feature_cols], train[target_cols]
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Build an actual-vs-predicted frame for the training rows.
# `y` was sliced out of `train`, so adding a column to it in place triggers
# SettingWithCopyWarning (hidden by the notebook-wide warnings filter).
# `.assign` returns a new frame instead, and `.ravel()` flattens the (n, 1)
# prediction array produced by a model fit on a 2-D target.
y = (
    y.rename(columns={'final_grade': 'actual'})
     .assign(predicted=lm.predict(X).ravel())
)
y.head()

Unnamed: 0,actual,predicted
86,77,69.57203
21,77,69.57203
31,81,74.445625
33,76,74.445625
102,65,69.57203


In [19]:
# Training-set mean squared error for the exam2-only model.
mse = mean_squared_error(y['actual'], y['predicted'])
mse

16.067423388644805

4 Compare your models in the following manner:
- Calculate the mean squared error
- Visualize the residuals. Create a separate visualization for each model.
- Visualize the actual vs the predicted values. Create a separate visualization for each model.
- Bonus: Combine the separate visualizations for each model into a single visualization. Is this visual helpful?

5 Create a model that uses exam 1 and exam 3 to predict final grade. How does this model compare to your previous ones?

In [20]:
# Two-feature model: exam1 + exam3 -> final_grade, fit on the training split.
feature_cols, target_cols = ['exam1', 'exam3'], ['final_grade']
X, y = train[feature_cols], train[target_cols]
lm = LinearRegression().fit(X, y)
lm


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# Build an actual-vs-predicted frame for the training rows, then score it.
# `y` was sliced out of `train`, so adding a column to it in place triggers
# SettingWithCopyWarning (hidden by the notebook-wide warnings filter).
# `.assign` returns a new frame instead, and `.ravel()` flattens the (n, 1)
# prediction array produced by a model fit on a 2-D target.
y = (
    y.rename(columns={'final_grade': 'actual'})
     .assign(predicted=lm.predict(X).ravel())
)
mse = mean_squared_error(y.actual, y.predicted)
mse

2.861172402955809

6 Take your best performing model and measure its performance on the test data set. How does performance differ between train and test?

In [22]:
# Best model so far (exam1 + exam3): refit on the training split so that `lm`
# and `y` are fresh before scoring the test split. Same specification as the
# fit in the previous exercise step.
feature_cols, target_cols = ['exam1', 'exam3'], ['final_grade']
X, y = train[feature_cols], train[target_cols]
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
# Same feature and target columns as the training fit, taken from the held-out split.
X_test, y_test = test[['exam1', 'exam3']], test[['final_grade']]
X_test.head()

Unnamed: 0,exam1,exam3
8,70,78
73,70,78
91,100,95
30,93,96
65,100,95


In [25]:
# Score the fitted model on the held-out rows.
# `y_test` was sliced out of `test`, so adding a column to it in place triggers
# SettingWithCopyWarning (hidden by the notebook-wide warnings filter).
# `.assign` returns a new frame instead, and `.ravel()` flattens the (n, 1)
# prediction array produced by a model fit on a 2-D target.
y_test = (
    y_test.rename(columns={'final_grade': 'actual'})
          .assign(predicted=lm.predict(X_test).ravel())
)
mse = mean_squared_error(y_test.actual, y_test.predicted)
mse

3.9801217204793495

## Exercise 2
As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

In [27]:
from wrangle import wrangle_telco

1 Run all your previous scripts that acquired, prepared, split, and scaled the telco churn data.

In [28]:
# Load the telco data through the project's wrangle helper (its acquisition and
# cleaning steps live in wrangle.py and are not shown in this notebook).
# NOTE: this rebinds `df`, replacing the student-grades frame used in Exercise 1.
df = wrangle_telco()
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [31]:
# 80/20 split of the telco frame with a fixed seed, then separate the two
# predictors from the total_charges target for each partition.
train, test = train_test_split(df, random_state=123, train_size=.8)

feature_cols = ['monthly_charges', 'tenure']
target_cols = ['total_charges']

X_train, y_train = train[feature_cols], train[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]

In [39]:
# Scale the feature frames with the project's min_max_scaler helper; it returns
# the scaled train frame, the scaled test frame, and the scaler object.
# NOTE(review): presumably the scaler is fit on X_train only and then applied
# to X_test — confirm in split_scale.py.
from split_scale import min_max_scaler
X_train_scaled, X_test_scaled,scaler = min_max_scaler(X_train, X_test)

2. Fit 3 different linear models to your data, one with just tenure, one with just monthly_charges, and one with both.

In [42]:
# Model 1: scaled tenure only -> total_charges.
X, y = X_train_scaled[['tenure']], y_train
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
# Training-set MSE for the model currently bound to `lm`.
prediction = lm.predict(X)
actual = y
mse = mean_squared_error(actual, prediction)
mse

3981691.454452979

In [45]:
# Model 2: scaled monthly_charges only -> total_charges, scored on the training rows.
X, y = X_train_scaled[['monthly_charges']], y_train
lm = LinearRegression()
lm.fit(X, y)

prediction = lm.predict(X)
actual = y
mse = mean_squared_error(actual, prediction)
mse

696952.9566968175

In [47]:
# Model 3: both monthly charges and tenure, plus their interaction term.
import sklearn.preprocessing

X = X_train_scaled[['monthly_charges', 'tenure']]
y = y_train

# degree=2 with interaction_only=True adds only the monthly_charges * tenure
# product (no squared terms); include_bias=False leaves the intercept to the
# linear model.
poly = sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X)

# PolynomialFeatures.get_feature_names() was removed in scikit-learn 1.2.
# Use get_feature_names_out() when it exists and fall back to the old method
# so the cell still runs on the older release this notebook was written with.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(X.columns)
else:
    poly_columns = poly.get_feature_names(X.columns)
X_poly = pd.DataFrame(poly.transform(X), columns=poly_columns)

# Use the LinearRegression name imported at the top, as the other model cells do.
lm = LinearRegression()
lm.fit(X_poly, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [51]:
# Preview the expanded design matrix: the two scaled features plus their product.
X_poly.head()

Unnamed: 0,monthly_charges,tenure,monthly_charges tenure
0,0.569008,0.971831,0.55298
1,0.018934,0.760563,0.0144
2,0.903338,0.901408,0.814277
3,0.796213,0.971831,0.773785
4,0.975087,0.985915,0.961354


In [49]:
# Training-set MSE for the interaction model fit on X_poly.
prediction = lm.predict(X_poly)
actual = y
mse = mean_squared_error(actual, prediction)
mse

7247.473829639549

In [52]:
# Fitted coefficients, in the column order of X_poly
# (monthly_charges, tenure, monthly_charges tenure).
lm.coef_

array([[ 111.45015279, 1305.64386784, 7104.14473591]])

3. Evaluate the models and your baseline.

In [56]:

# Baseline: predict the training-set mean of the target for every row.
actual = y_train

# Build the constant prediction explicitly. The previous form,
# `y_train.mean() + y_train - y_train`, depended on pandas broadcasting and
# added floating-point rounding to every element.
baseline = np.tile(y_train.mean().to_numpy(), (len(y_train), 1))

mse = mean_squared_error(actual, baseline)
mse

6608313.117141247

4. Select the model that performed the best, and apply it to your test data.