## Finalized Version Below

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
import os

def _wilder_average(values, period=14):
    # Seed row `period` with the plain mean of rows 1..period, blank the
    # warm-up rows, then smooth recursively with alpha = 1/period.
    if len(values) <= period:
        # Not enough rows to seed the average: the indicator is undefined.
        return pd.Series(np.nan, index=values.index, dtype=float)
    seeded = values.copy()  # explicit copy: never write through a chained view
    seeded.iloc[period] = values.iloc[1:period + 1].mean()
    seeded.iloc[1:period] = np.nan
    return seeded.ewm(adjust=False, alpha=1 / period).mean()


def get_Indicators(data):
    """Add the five technical-indicator columns to ``data`` and return it.

    ``data`` must contain 'Adj Close', 'Close' and 'Volume' columns. The frame
    is modified in place; besides 'Indicator1'..'Indicator5' the intermediate
    columns 'diff', 'gain', 'loss', 'avg_gain', 'avg_loss', '11', '33', '22'
    and '44' are also added.

    * Indicator1: 100 - 100 / (1 + avg_gain / avg_loss) over 14-row smoothed
      gains and losses of 'Adj Close' (NaN for the first 14 rows, and for
      every row when the frame has fewer than 15 rows).
    * Indicator2: cumulative 'Volume', signed by the direction of the daily
      'Close' change (0 when the change is zero or missing).
    * Indicator3: exponentially weighted mean (com=5) of Indicator2.
    * Indicator4: span-9 EWM of (span-12 EWM - span-26 EWM) of 'Adj Close'.
    * Indicator5: span-4 EWM of (span-6 EWM - span-13 EWM) of 'Adj Close'.
    """
    adj_close = data['Adj Close']

    # Daily change split into its positive part (gain) and the magnitude of
    # its negative part (loss); NaN changes stay NaN.
    data['diff'] = adj_close - adj_close.shift(1)
    data['gain'] = np.maximum(0, data['diff'])
    data['loss'] = -np.minimum(0, data['diff'])
    data['avg_gain'] = _wilder_average(data['gain'])
    data['avg_loss'] = _wilder_average(data['loss'])
    data['Indicator1'] = 100 - 100 / (1 + data['avg_gain'] / data['avg_loss'])

    # Signed-volume running total and its smoothed version.
    close_diff = data['Close'].diff()
    volume = data['Volume']
    signed_volume = np.where(
        close_diff > 0,
        volume,
        np.where(close_diff.fillna(0) == 0, 0, -volume),
    )
    running_volume = pd.Series(signed_volume.cumsum(), index=data.index)
    data['Indicator2'] = running_volume
    data['Indicator3'] = running_volume.ewm(com=5).mean()  # center of mass = 5

    # Smoothed difference of a fast and a slow EWM of the adjusted close.
    data["11"] = adj_close.ewm(span=12).mean()
    data["33"] = adj_close.ewm(span=26).mean()
    data["Indicator4"] = (data["11"] - data["33"]).ewm(span=9).mean()

    # Same construction with shorter spans.
    data["22"] = adj_close.ewm(span=6).mean()
    data["44"] = adj_close.ewm(span=13).mean()
    data["Indicator5"] = (data["22"] - data["44"]).ewm(span=4).mean()
    return data

# Enter your code here. Read input from STDIN. Print output to STDOUT


def create_classification_trading_condition(df):
    """Step 1: build the classification features X and labels Y.

    ``df`` is an OHLCV frame such as the one returned by
    ``yf.download('GOOG', start=start_date, end=end_date)``. The columns
    'Target', 'Open-Close' and 'High-Low' are added to the caller's ``df``;
    all further work happens on a re-indexed copy.

    'Target' on day n is 1 when the Close of day n+1 is strictly greater than
    the Close of day n, and -1 otherwise. The last row has no next day, so its
    comparison is False and it is labelled -1.

    Returns a tuple ``(X, Y)`` sharing the same index, after rows containing
    NaN have been dropped. X holds 'Open-Close', 'High-Low',
    'Indicator1'..'Indicator5' and 'good indicator'; Y holds 'Target'.
    """
    close = df['Close']
    next_day_change = close.shift(-1) - close
    df['Target'] = np.where(next_day_change > 0, 1, -1)
    df['Open-Close'] = df['Open'] - df['Close']
    df['High-Low'] = df['High'] - df['Low']
    # get_Indicators addresses rows by integer label, so use a 0..n-1 index.
    df = df.reset_index()

    # do not change ########
    df = get_Indicators(df)  # generate 'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5'
    # a simulated very good indicator
    df['good indicator'] = df['Target'] + np.random.normal(0, 5, len(df))
    df = df.dropna()
    ########################

    feature_columns = [
        'Open-Close', 'High-Low',
        'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5',
        'good indicator',
    ]
    X = df[feature_columns].copy()
    Y = df[['Target']].copy()
    return (X, Y)

def create_regression_trading_condition(df):
    """Step 2: build the regression features X and continuous target Y.

    Same construction as the classification step; only 'Target' differs:
    on day n it is the Close of day n+1 minus the Close of day n. The last
    row therefore has a NaN target and is removed by the NaN drop.

    The columns 'Target', 'Open-Close' and 'High-Low' are added to the
    caller's ``df``. Returns ``(X, Y)`` with a shared index.
    """
    close = df['Close']
    df['Target'] = close.shift(-1) - close
    df['Open-Close'] = df['Open'] - df['Close']
    df['High-Low'] = df['High'] - df['Low']
    # get_Indicators addresses rows by integer label, so use a 0..n-1 index.
    df = df.reset_index()

    # do not change ########
    df = get_Indicators(df)  # generate 'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5'
    # a simulated very good indicator
    df['good indicator'] = df['Target'] + np.random.normal(0, 5, len(df))
    df = df.dropna()
    ########################

    feature_columns = [
        'Open-Close', 'High-Low',
        'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5',
        'good indicator',
    ]
    X = df[feature_columns].copy()
    Y = df[['Target']].copy()
    return (X, Y)

def create_train_split_group(X,Y,split_ratio=0.8):
    """Step 3: split X and Y into train/test parts without shuffling.

    ``split_ratio`` is the fraction of rows assigned to the training part;
    because shuffling is off, the training rows are the leading ones.
    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, Y, train_size=split_ratio, shuffle=False)
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

def OrdinaryLinearRegression(data):
    """Step 4 (test1): fit ordinary least squares on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.LinearRegression``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.LinearRegression()
    return model.fit(train_features, train_target)
    
def your_lasso(data):
    """Step 5 (test2): fit a Lasso model (alpha=0.1) on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.Lasso``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.Lasso(alpha=0.1)
    return model.fit(train_features, train_target)

def your_ridge(data):
    """Step 6 (test3): fit a Ridge model (alpha=10000) on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.Ridge``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.Ridge(alpha=10000)
    return model.fit(train_features, train_target)

def your_pca_knn(data):
    """Step 7 (test4): standardize, reduce with PCA, then fit k-NN.

    Seeds NumPy's RNG with 1, builds the classification dataset from ``data``,
    standardizes X, splits 80/20 without shuffling, fits PCA on the training
    part keeping enough components for 90% of the variance, and trains a
    ``KNeighborsClassifier`` on the projected training data.

    Returns ``(nn, X_train_pca, Y_train, X_test_pca, Y_test)``.
    """
    np.random.seed(1)
    # Use the function's own argument, not a module-level ``df``.
    X, Y = create_classification_trading_condition(data)
    scale = StandardScaler()
    X = scale.fit_transform(X)
    X_train, X_test, Y_train, Y_test = create_train_split_group(X, Y, split_ratio=0.8)
    pca = PCA(n_components=.9)
    pca.fit(X_train)  # the projection is learned from training rows only
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    nn = KNeighborsClassifier()
    nn.fit(X_train_pca, Y_train)

    return nn, X_train_pca, Y_train, X_test_pca, Y_test


def your_lda_knn(data):
    """Step 8 (test5): standardize, project with LDA, then fit k-NN.

    Seeds NumPy's RNG with 1, builds the classification dataset from ``data``,
    standardizes X, splits 80/20 without shuffling, fits
    ``LinearDiscriminantAnalysis`` on the training part and trains a
    ``KNeighborsClassifier`` on the LDA-transformed training data.

    Returns ``(nn, X_train, Y_train, X_test, Y_test)`` where both X parts are
    the LDA-transformed arrays.
    """
    np.random.seed(1)
    features, labels = create_classification_trading_condition(data)
    scaled = StandardScaler().fit_transform(features)
    raw_train, raw_test, Y_train, Y_test = create_train_split_group(
        scaled, labels, split_ratio=0.8
    )
    projector = LinearDiscriminantAnalysis()
    projector.fit(raw_train, Y_train)
    train_projected = projector.transform(raw_train)
    test_projected = projector.transform(raw_test)
    classifier = KNeighborsClassifier()
    classifier.fit(train_projected, Y_train)

    return classifier, train_projected, Y_train, test_projected, Y_test

def test1(data):
    """Print coefficients, then MSE and R^2 on the train and test splits, for the OLS model."""
    model = OrdinaryLinearRegression(data)
    features, target = create_regression_trading_condition(data)
    X_train, X_test, Y_train, Y_test = create_train_split_group(features, target, split_ratio=0.8)
    print('Coefficients: \n', model.coef_)
    # Train split first, then test split, two lines each.
    for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
        predicted = model.predict(split_X)
        print("Mean squared error: %.2f"% mean_squared_error(split_Y, predicted))
        print("Variance score: %.2f"% r2_score(split_Y, predicted))

def test2(data):
    """Print coefficients, then MSE and R^2 on the train and test splits, for the Lasso model."""
    model = your_lasso(data)
    features, target = create_regression_trading_condition(data)
    X_train, X_test, Y_train, Y_test = create_train_split_group(features, target, split_ratio=0.8)
    print('Coefficients: \n', model.coef_)
    # Train split first, then test split, two lines each.
    for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
        predicted = model.predict(split_X)
        print("Mean squared error: %.2f"% mean_squared_error(split_Y, predicted))
        print("Variance score: %.2f"% r2_score(split_Y, predicted))

def test3(data):
    """Print coefficients, then MSE and R^2 on the train and test splits, for the Ridge model.

    The model reported on is the one returned by ``your_ridge``; it is not
    refitted here, so this mirrors ``test1`` and ``test2``.
    """
    ridge = your_ridge(data)
    X,Y = create_regression_trading_condition(data)
    X_train, X_test, Y_train, Y_test = create_train_split_group(X, Y, split_ratio=0.8)

    print('Coefficients: \n', ridge.coef_)
    print("Mean squared error: %.2f"% mean_squared_error(Y_train, ridge.predict(X_train)))
    print("Variance score: %.2f"% r2_score(Y_train, ridge.predict(X_train)))
    print("Mean squared error: %.2f"% mean_squared_error(Y_test, ridge.predict(X_test)))
    print("Variance score: %.2f"% r2_score(Y_test, ridge.predict(X_test)))

def test4(data):
    """Print train and test accuracy of the PCA + k-NN classifier.

    ``your_pca_knn`` seeds NumPy's RNG and builds the dataset itself, so the
    dataset is not built a second time here.
    """
    nn, X_train_pca, Y_train, X_test_pca, Y_test = your_pca_knn(data)
    print(nn.score(X_train_pca, Y_train), nn.score(X_test_pca, Y_test))

def test5(data):
    """Print train and test accuracy of the LDA + k-NN classifier.

    ``your_lda_knn`` seeds NumPy's RNG and builds the dataset itself, so the
    dataset is not built a second time here.
    """
    nn, X_train, Y_train, X_test, Y_test = your_lda_knn(data)
    print(nn.score(X_train, Y_train), nn.score(X_test, Y_test))

if __name__ == '__main__':
    # Input protocol on STDIN:
    #   line 1: test selector ('1'..'5')
    #   line 2: number of data rows
    #   line 3: tab-separated column names
    #   then one tab-separated row per line: a date, five floats, one integer
    # fptr = open(os.environ['OUTPUT_PATH'], 'w')
    # fptr = sys.stdout
    tmp = input()
    row_num = int(input().strip())
    Data = []
    col_names = input().split('\t')
    for i in range(row_num):
        line = input().split('\t')
        line[0] = pd.to_datetime(line[0])
        for j in range(1, 6):
            line[j] = float(line[j])
        line[6] = int(line[6])
        Data.append(line)
    df = pd.DataFrame(Data, columns=col_names)
    # set_index returns a new frame; keep it so 'Date' really becomes the index.
    df = df.set_index('Date')

    np.random.seed(1)
    if tmp == '1':
        test1(df)
    elif tmp == '2':
        test2(df)
    elif tmp == '3':
        test3(df)
    elif tmp == '4':
        test4(df)
    elif tmp == '5':
        test5(df)
    else:
        raise RuntimeError('invalid input')

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
import os

In [170]:
import yfinance as yf
# Request unadjusted prices explicitly: get_Indicators reads the 'Adj Close'
# column, which is only returned when auto_adjust is off.
# NOTE(review): some yfinance releases return multi-level columns even for a
# single ticker; flatten them before use if that applies — confirm against the
# installed version.
df = yf.download('GOOG', start="2004-08-19", end="2015-08-10", auto_adjust=False)

[*********************100%***********************]  1 of 1 completed


In [177]:
def create_classification_trading_condition(df):
    """Step 1: build the classification features X and labels Y.

    ``df`` is an OHLCV frame such as the one returned by
    ``yf.download('GOOG', start=start_date, end=end_date)``. The columns
    'Target', 'Open-Close' and 'High-Low' are added to the caller's ``df``;
    all further work happens on a re-indexed copy.

    'Target' on day n is 1 when the Close of day n+1 is strictly greater than
    the Close of day n, and -1 otherwise. The last row has no next day, so its
    comparison is False and it is labelled -1.

    Returns a tuple ``(X, Y)`` sharing the same index, after rows containing
    NaN have been dropped. X holds 'Open-Close', 'High-Low',
    'Indicator1'..'Indicator5' and 'good indicator'; Y holds 'Target'.
    """
    close = df['Close']
    next_day_change = close.shift(-1) - close
    df['Target'] = np.where(next_day_change > 0, 1, -1)
    df['Open-Close'] = df['Open'] - df['Close']
    df['High-Low'] = df['High'] - df['Low']
    # get_Indicators addresses rows by integer label, so use a 0..n-1 index.
    df = df.reset_index()

    # do not change ########
    df = get_Indicators(df)  # generate 'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5'
    # a simulated very good indicator
    df['good indicator'] = df['Target'] + np.random.normal(0, 5, len(df))
    df = df.dropna()
    ########################

    feature_columns = [
        'Open-Close', 'High-Low',
        'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5',
        'good indicator',
    ]
    X = df[feature_columns].copy()
    Y = df[['Target']].copy()
    return (X, Y)

def create_regression_trading_condition(df):
    """Step 2: build the regression features X and continuous target Y.

    Same construction as the classification step; only 'Target' differs:
    on day n it is the Close of day n+1 minus the Close of day n. The last
    row therefore has a NaN target and is removed by the NaN drop.

    The columns 'Target', 'Open-Close' and 'High-Low' are added to the
    caller's ``df``. Returns ``(X, Y)`` with a shared index.
    """
    close = df['Close']
    df['Target'] = close.shift(-1) - close
    df['Open-Close'] = df['Open'] - df['Close']
    df['High-Low'] = df['High'] - df['Low']
    # get_Indicators addresses rows by integer label, so use a 0..n-1 index.
    df = df.reset_index()

    # do not change ########
    df = get_Indicators(df)  # generate 'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5'
    # a simulated very good indicator
    df['good indicator'] = df['Target'] + np.random.normal(0, 5, len(df))
    df = df.dropna()
    ########################

    feature_columns = [
        'Open-Close', 'High-Low',
        'Indicator1', 'Indicator2', 'Indicator3', 'Indicator4', 'Indicator5',
        'good indicator',
    ]
    X = df[feature_columns].copy()
    Y = df[['Target']].copy()
    return (X, Y)

def create_train_split_group(X,Y,split_ratio=0.8):
    """Step 3: split X and Y into train/test parts without shuffling.

    ``split_ratio`` is the fraction of rows assigned to the training part;
    because shuffling is off, the training rows are the leading ones.
    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, Y, train_size=split_ratio, shuffle=False)
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

def OrdinaryLinearRegression(data):
    """Step 4 (test1): fit ordinary least squares on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.LinearRegression``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.LinearRegression()
    return model.fit(train_features, train_target)
    
def your_lasso(data):
    """Step 5 (test2): fit a Lasso model (alpha=0.1) on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.Lasso``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.Lasso(alpha=0.1)
    return model.fit(train_features, train_target)

def your_ridge(data):
    """Step 6 (test3): fit a Ridge model (alpha=10000) on the training split.

    Seeds NumPy's RNG with 1, builds the regression dataset from ``data``,
    keeps the first 80% of rows for training and returns the fitted
    ``linear_model.Ridge``.
    """
    np.random.seed(1)
    features, target = create_regression_trading_condition(data)
    train_features, _, train_target, _ = create_train_split_group(
        features, target, split_ratio=0.8
    )
    model = linear_model.Ridge(alpha=10000)
    return model.fit(train_features, train_target)

def your_pca_knn(data):
    """Step 7 (test4): standardize, reduce with PCA, then fit k-NN.

    Seeds NumPy's RNG with 1, builds the classification dataset from ``data``,
    standardizes X, splits 80/20 without shuffling, fits PCA on the training
    part keeping enough components for 90% of the variance, and trains a
    ``KNeighborsClassifier`` on the projected training data.

    Returns ``(nn, X_train_pca, Y_train, X_test_pca, Y_test)``.
    """
    np.random.seed(1)
    # Use the function's own argument, not a module-level ``df``.
    X, Y = create_classification_trading_condition(data)
    scale = StandardScaler()
    # The scaler must be fitted, and its output kept, for scaling to apply.
    X = scale.fit_transform(X)
    X_train, X_test, Y_train, Y_test = create_train_split_group(X, Y, split_ratio=0.8)
    pca = PCA(n_components=.9)
    pca.fit(X_train)  # the projection is learned from training rows only
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    nn = KNeighborsClassifier()
    nn.fit(X_train_pca, Y_train)

    return nn, X_train_pca, Y_train, X_test_pca, Y_test


def your_lda_knn(data):
    """Step 8 (test5): standardize, project with LDA, then fit k-NN.

    Seeds NumPy's RNG with 1, builds the classification dataset from ``data``,
    standardizes X, splits 80/20 without shuffling, fits
    ``LinearDiscriminantAnalysis`` on the training part and trains a
    ``KNeighborsClassifier`` on the LDA-transformed training data.

    Returns ``(nn, X_train, Y_train, X_test, Y_test)`` where both X parts are
    the LDA-transformed arrays.
    """
    np.random.seed(1)
    # Use the function's own argument, not a module-level ``df``.
    X, Y = create_classification_trading_condition(data)
    scale = StandardScaler()
    # Keep the scaled output; fit_transform does not modify X in place.
    X = scale.fit_transform(X)
    X_train, X_test, Y_train, Y_test = create_train_split_group(X, Y, split_ratio=0.8)
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, Y_train)
    # Both splits go through the projection learned from the training rows.
    X_train = lda.transform(X_train)
    X_test = lda.transform(X_test)
    nn = KNeighborsClassifier()
    # A classifier needs the labels as well as the features.
    nn.fit(X_train, Y_train)

    return nn, X_train, Y_train, X_test, Y_test

In [None]:
def test5(data):
    """Print train and test accuracy of the LDA + k-NN classifier.

    ``your_lda_knn`` seeds NumPy's RNG and builds the dataset itself, so the
    dataset is not built a second time here.
    """
    nn, X_train, Y_train, X_test, Y_test = your_lda_knn(data)
    print(nn.score(X_train, Y_train), nn.score(X_test, Y_test))

In [None]:
if __name__ == '__main__':
    # Input protocol on STDIN:
    #   line 1: test selector ('1'..'5')
    #   line 2: number of data rows
    #   line 3: tab-separated column names
    #   then one tab-separated row per line: a date, five floats, one integer
    # fptr = open(os.environ['OUTPUT_PATH'], 'w')
    # fptr = sys.stdout
    tmp = input()
    row_num = int(input().strip())
    Data = []
    col_names = input().split('\t')
    for i in range(row_num):
        line = input().split('\t')
        line[0] = pd.to_datetime(line[0])
        for j in range(1, 6):
            line[j] = float(line[j])
        line[6] = int(line[6])
        Data.append(line)
    df = pd.DataFrame(Data, columns=col_names)
    # set_index returns a new frame; keep it so 'Date' really becomes the index.
    df = df.set_index('Date')

    np.random.seed(1)
    if tmp == '1':
        test1(df)
    elif tmp == '2':
        test2(df)
    elif tmp == '3':
        test3(df)
    elif tmp == '4':
        test4(df)
    elif tmp == '5':
        test5(df)
    else:
        raise RuntimeError('invalid input')

In [51]:
# Preview the first rows of the price frame (notebook cell output follows).
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,49.81329,51.835709,47.800831,49.982655,49.982655,44871361
2004-08-20,50.316402,54.336334,50.062355,53.95277,53.95277,22942874
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897
2004-08-24,55.4123,55.591629,51.591621,52.239197,52.239197,15319808
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276
