#Topics covered:

- Overfitting and Underfitting

- Linear Regression - Least Squares

- DummyRegressor: 
it's a regressor that always returns the mean, or another simple value derived from the training targets (e.g. the median or a constant). Used as a baseline for comparison with other models

- Polynomial Features: 
Turns the data into a polynomial form
e.g. if we want to turn the data into a polynomial of degree 2:
if we have one feature x0, it will convert it to: x0, x0^2
for two features x0, x1, the data will convert to:  x0, x1, x0^2, x0*x1, x1^2

- R2 (R square):
it's a regression scoring function whose best possible score is 1. For scoring: perfect fit = 1 (e.g. the predictions coincide with the data), bad = 0 (always predicts the same value, the mean), ugly = negative (worse than always predicting the mean, e.g. predicting the opposite trend; the score has no lower bound)


In [None]:
# let's read price of stock over time
def getXAndY():
    """Load the stock series stored in ``stock.txt``.

    Returns:
        tuple: ``(time, price)`` where ``time`` is the ``time`` column as a
        NumPy array and ``price`` is the ``price`` column as a pandas Series.
    """
    import pandas as pd

    stock = pd.read_csv('stock.txt')
    times = stock['time'].values
    prices = stock['price']
    return (times, prices)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the (time, price) series and draw the raw observations as red dots.
x, y = getXAndY()
plt.plot(x, y, 'ro')
plt.show()

In [None]:
# let's make a model that always predicts the mean
# we need to reshape the data because scikit-learn estimators expect the features as a 2-D array [[]] and the data is just one column []
# we do not need to do a reshape when our dataset has more than one column (it is already 2-D)

def question1():
    """Fit a mean-predicting baseline and score it with R2.

    Loads the series with ``getXAndY``, splits it into train and test sets
    (``random_state=0``), fits ``DummyRegressor(strategy='mean')`` on the
    training part, plots the data together with the constant prediction and
    computes the R2 score on both splits.

    Returns:
        tuple: ``(train_r2, test_r2)``.
    """
    import matplotlib.pyplot as plt
    from sklearn.dummy import DummyRegressor
    # r2_score lives in the public ``sklearn.metrics`` namespace; the private
    # ``sklearn.metrics.regression`` module was deprecated in scikit-learn
    # 0.22 and removed in later releases.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    (x, y) = getXAndY()
    # A single feature has to be presented as one column of a 2-D array.
    X_train, X_test, y_train, y_test = train_test_split(
        x.reshape(-1, 1), y, random_state=0)

    dummyReg = DummyRegressor(strategy='mean').fit(X_train, y_train)
    y_pred_train = dummyReg.predict(X_train)
    y_pred_test = dummyReg.predict(X_test)

    # Raw data as red dots, baseline prediction drawn over the training inputs.
    plt.plot(x, y, 'ro')
    plt.plot(X_train, y_pred_train)
    plt.show()

    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)

question1()

In [None]:
# let's do a linear regression with a polynomial of degree = 1 ( a line)
def question2():
    """Fit a straight-line (degree 1) least-squares regression and score it.

    Loads the series with ``getXAndY``, splits it into train and test sets
    (``random_state=0``), fits ``LinearRegression`` on the training part,
    plots the data together with the fitted line and computes the R2 score
    on both splits.

    Returns:
        tuple: ``(train_r2, test_r2)``.
    """
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression
    # r2_score lives in the public ``sklearn.metrics`` namespace; the private
    # ``sklearn.metrics.regression`` module was deprecated in scikit-learn
    # 0.22 and removed in later releases.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    (x, y) = getXAndY()
    # A single feature has to be presented as one column of a 2-D array.
    X_train, X_test, y_train, y_test = train_test_split(
        x.reshape(-1, 1), y, random_state=0)

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred_train = linreg.predict(X_train)
    y_pred_test = linreg.predict(X_test)

    # Raw data as red dots, fitted line drawn over the training inputs.
    plt.plot(x, y, 'ro')
    plt.plot(X_train, y_pred_train)
    plt.show()

    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)

question2()
    

In [None]:
# let's do a polynomial 2 regression
def question3():
    """Fit a degree-2 polynomial regression and score it with R2.

    Loads the series with ``getXAndY``, expands the single feature with
    ``PolynomialFeatures(degree=2, include_bias=False)``, splits the result
    into train and test sets (``random_state=0``), fits ``LinearRegression``
    on the training part, plots the data together with the fitted curve and
    computes the R2 score on both splits.

    Returns:
        tuple: ``(train_r2, test_r2)``.
    """
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression
    # r2_score lives in the public ``sklearn.metrics`` namespace; the private
    # ``sklearn.metrics.regression`` module was deprecated in scikit-learn
    # 0.22 and removed in later releases.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures

    (x, y) = getXAndY()
    # The expansion is applied to the whole series so the same matrix can be
    # used both for the split and for drawing the curve over every point.
    X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(
        x.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, random_state=0)

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred_train = linreg.predict(X_train)
    y_pred_test = linreg.predict(X_test)
    y_plot = linreg.predict(X_poly)

    # Raw data as red dots, fitted curve evaluated at every observed x.
    plt.plot(x, y, 'ro')
    plt.plot(x, y_plot)
    plt.show()

    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)

question3()

In [None]:
# let's do a linear regression for a polynomial of degree 10
# return r2 scores for the training and testing set
def question4():
    """Fit a degree-10 polynomial regression and score it with R2.

    Loads the series with ``getXAndY``, expands the single feature with
    ``PolynomialFeatures(degree=10, include_bias=False)``, splits the result
    into train and test sets (``random_state=0``), fits ``LinearRegression``
    on the training part, plots the data together with the fitted curve and
    computes the R2 score on both splits.

    Returns:
        tuple: ``(train_r2, test_r2)``.
    """
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression
    # r2_score lives in the public ``sklearn.metrics`` namespace; the private
    # ``sklearn.metrics.regression`` module was deprecated in scikit-learn
    # 0.22 and removed in later releases.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures

    (x, y) = getXAndY()
    # The expansion is applied to the whole series so the same matrix can be
    # used both for the split and for drawing the curve over every point.
    X_poly = PolynomialFeatures(degree=10, include_bias=False).fit_transform(
        x.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, random_state=0)

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred_train = linreg.predict(X_train)
    y_pred_test = linreg.predict(X_test)
    y_plot = linreg.predict(X_poly)

    # Raw data as red dots, fitted curve evaluated at every observed x.
    plt.plot(x, y, 'ro')
    plt.plot(x, y_plot)
    plt.show()

    q1 = r2_score(y_train, y_pred_train)
    q2 = r2_score(y_test, y_pred_test)
    return (q1, q2)

question4()

In [None]:
# let's do a for loop from degree 2 through 9 for the data
# capture the r2 scores for the test and training sets for each iteration
# return the degree at which you think it's not overfitting or underfitting (high values for r2 scores
# for both train and test sets)
# extra:plot the r2 scores, in the x axis the training set r2, in the y axis the r2 results for the test set
# label each of the points
def question5():
    """Compare polynomial degrees 2 through 9 by their train/test R2 scores.

    For every degree the single feature returned by ``getXAndY`` is expanded
    with ``PolynomialFeatures``, split into train and test sets
    (``random_state=0``) and fitted with ``LinearRegression``. The train R2
    (x axis) is then plotted against the test R2 (y axis), with every point
    annotated by its degree.

    Returns:
        int: the hand-picked degree (6) judged to neither underfit nor
        overfit.
    """
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression
    # r2_score lives in the public ``sklearn.metrics`` namespace; the private
    # ``sklearn.metrics.regression`` module was deprecated in scikit-learn
    # 0.22 and removed in later releases.
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures

    (x, y) = getXAndY()
    degrees = range(2, 10)
    # The column-shaped feature does not depend on the degree; build it once.
    x_column = x.reshape(-1, 1)

    r2_train = []
    r2_test = []
    for degree in degrees:
        X_poly = PolynomialFeatures(
            degree=degree, include_bias=False).fit_transform(x_column)
        X_train, X_test, y_train, y_test = train_test_split(
            X_poly, y, random_state=0)

        linreg = LinearRegression().fit(X_train, y_train)
        r2_train.append(r2_score(y_train, linreg.predict(X_train)))
        r2_test.append(r2_score(y_test, linreg.predict(X_test)))

    # One label per point, taken from the same degree range as the loop.
    labels = [str(degree) for degree in degrees]

    plt.title("R2 score for Train vs Test data for different degrees of polyn")
    plt.plot(r2_train, r2_test, 'r-')
    plt.xlabel("Train")
    plt.ylabel("Test")
    for label, train_score, test_score in zip(labels, r2_train, r2_test):
        plt.annotate(
            label,
            xy=(train_score, test_score), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.show()
    return 6  # seven is as good but maybe overfitting as new data comes

question5()