#Topics covered:

- Overfitting and Underfitting:
    - underfitting: a model that is too simple
    - overfitting: a model that is too complex; it fits the training data (including its noise) very well but is not a good predictor on new data
- DummyRegressor
- Linear Regression (Least Squares)
- Polynomial Features 
- Regression Metrics (how good is your model?)
    - R2 (R squared) score: more common
    - Mean Absolute error
    - Mean Squared error

In [None]:
# let's read price of stock over time
def getXAndY(path='stock.txt'):
    """Load the stock price series used by the exercises.

    Parameters
    ----------
    path : str, optional
        CSV file containing a ``time`` and a ``price`` column. Defaults to
        ``'stock.txt'`` in the current working directory.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the ``time`` column as an ``(n, 1)`` NumPy
        array and ``y`` is the ``price`` column as a pandas Series.
    """
    import pandas as pd

    df = pd.read_csv(path)

    # scikit-learn estimators expect a 2-D feature matrix, so the single
    # 'time' column is reshaped from (n,) to (n, 1)
    X = df['time'].values.reshape(-1, 1)
    y = df['price']
    return (X, y)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the series and draw the raw observations as red dots
x, y = getXAndY()
plt.plot(x, y, 'ro')
plt.title('Stock price')
plt.xlabel('time/date')
plt.ylabel('price')
plt.show()

In [None]:
def plotResults(Xtr, Xtst, ytr, ytst, Xplot, y_plot, title):
    """Show the train/test points together with a model's predictions.

    Train points (Xtr, ytr) are drawn as red dots, test points (Xtst, ytst)
    as green dots, and (Xplot, y_plot) with matplotlib's default line style.
    `title` is used as the figure title.
    """
    import matplotlib.pyplot as plt

    (train_pts,) = plt.plot(Xtr, ytr, 'ro')
    (test_pts,) = plt.plot(Xtst, ytst, 'go')
    plt.plot(Xplot, y_plot)

    # Only the two point series get legend entries; the prediction curve
    # is left unlabeled
    plt.legend([train_pts, test_pts], ['Train', 'Test'])
    plt.title(title)
    plt.xlabel('time/date')
    plt.ylabel('price')
    plt.show()
    

In [None]:
# let's make a model that always predicts the mean: use DummyRegressor
# r2_score (R-squared) is a regression scoring function: the best possible score is 1.0,
# and a constant model that always predicts the mean of y scores 0.0
def question1():
    """Fit a mean-predicting baseline and report its R^2 on train and test.

    Loads the stock data, splits it into train and test sets, fits a
    DummyRegressor that always predicts the training mean, plots the
    result, and scores the predictions.

    Returns
    -------
    tuple
        ``(train_r2, test_r2)``.
    """
    from sklearn.dummy import DummyRegressor
    # r2_score is imported from the public sklearn.metrics API; the private
    # sklearn.metrics.regression module is not importable in current releases
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    (X, y) = getXAndY()

    # train_test_split with the default options assigns 75% of the points
    # to train and 25% to test
    # use random_state = 0 (or any number) to make sure your results are repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # the 'mean' strategy ignores X and always returns the mean of y_train
    dummyReg = DummyRegressor(strategy='mean').fit(X_train, y_train)

    # calculate our predicted values for train and test
    y_pred_train = dummyReg.predict(X_train)
    y_pred_test = dummyReg.predict(X_test)

    plotResults(X_train, X_test, y_train, y_test, X_train, y_pred_train, 'Dummy Regressor')

    # compare the predicted values with the real ones
    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)


question1()

In [None]:
# let's do a linear regression (default options) with a polynomial of degree = 1 ( a line)
# print the weights (coefficients)
def question2():
    """Exercise stub: linear regression of degree 1 (a line) on the stock data.

    Currently only loads the data and returns a placeholder string; the
    split, fit and scoring steps are left to be completed. The imports
    below are the tools intended for the solution.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    # r2_score is imported from the public sklearn.metrics API; the private
    # sklearn.metrics.regression module is not importable in current releases
    from sklearn.metrics import r2_score
    import warnings
    # silence warnings raised from scipy whose message starts with
    # "internal gelsd" (presumably triggered by LinearRegression's solver
    # on some SciPy versions - TODO confirm)
    warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

    (X, y) = getXAndY()

    #complete:

    return "r2_score for train and test"


question2()
    

In [None]:
# let's do a regression for a polynomial of degree 2 (y = ax^2 + bx + c)
# PolynomialFeatures turns the data into a polynomial form
# e.g. if we want to turn the data into a polynomial of degree 2:
# and we have one feature x0, it will convert it to: x0, x0^2
# for two features x0, x1, the data will convert to:  x0, x1, x0^2, x0*x1, x1^2
def question3():
    """Fit a degree-2 polynomial regression and report R^2 on train and test.

    The single time feature is expanded to (x, x^2) with PolynomialFeatures
    (no bias column), a LinearRegression is fitted on the training split,
    the fitted curve is plotted over the original points, and the intercept
    and coefficients are printed.

    Returns
    -------
    tuple
        ``(train_r2, test_r2)``.
    """
    from sklearn.linear_model import LinearRegression
    # r2_score is imported from the public sklearn.metrics API; the private
    # sklearn.metrics.regression module is not importable in current releases
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures

    (X, y) = getXAndY()

    # split the raw feature once, then expand each part; this keeps the
    # original (un-expanded) train/test points available for plotting
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # fit the transformer on the training split only and reuse it everywhere
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    linreg = LinearRegression().fit(X_train_poly, y_train)
    y_pred_train = linreg.predict(X_train_poly)
    y_pred_test = linreg.predict(X_test_poly)

    # predictions over every point, used to draw the fitted curve
    y_plot = linreg.predict(poly.transform(X))

    # plot the original points train and test
    plotResults(X_train, X_test, y_train, y_test, X, y_plot, 'Polynomial deg 2')

    print(linreg.intercept_, linreg.coef_)
    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)


question3()

In [None]:
# let's do a linear regression for a polynomial of degree 10
# return r2 scores for the training and testing sets
# change the degree between 3 and 10: what do you see?
def question4():
    """Exercise stub: polynomial regression of degree `deg` on the stock data.

    Currently only loads the data, sets ``deg = 10`` and returns a
    placeholder string; the feature expansion, fit and scoring steps are
    left to be completed. The imports below are the tools intended for the
    solution.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split
    # r2_score is imported from the public sklearn.metrics API; the private
    # sklearn.metrics.regression module is not importable in current releases
    from sklearn.metrics import r2_score

    (X, y) = getXAndY()

    # polynomial degree to experiment with
    deg = 10
    #complete:

    return "r2_score for train and test"


question4()

In [None]:
# let's do a for loop from degree 2 through 9 for the data
# capture the r2 scores for the test and training sets for each iteration
# return the degree at which you think it's not overfitting or underfitting (high values for r2 scores
# for both train and test sets)
# extra credit: plot the r2 scores, in the x axis the training set r2, 
# in the y axis the r2 results for the test set, and label each of the points

def question5():
    """Exercise stub: compare polynomial degrees on the stock data.

    Currently only loads the data and returns a placeholder string; the
    loop over degrees and the r2 bookkeeping are left to be completed. The
    imports below are the tools intended for the solution.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split
    # r2_score is imported from the public sklearn.metrics API; the private
    # sklearn.metrics.regression module is not importable in current releases
    from sklearn.metrics import r2_score

    (X, y) = getXAndY()

    return "degree of the polynomial that does not overfit or underfit"


question5()