### Dataset used - Stock_train.csv which tackles the classification problem

In [80]:
#Importing all the necessary libraries

import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, log_loss
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import itertools

import tensorflow
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers

import bs4 as bs
import requests
import yfinance as yf
import datetime
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Loading the training dataset from disk into a dataframe and previewing its first rows
train_csv_path = r'D:\Stock_train.csv'
data = pd.read_csv(train_csv_path, sep=",")
data.head()

# Part A

In [None]:
#Finding the percentage of NaN's in each column
percent_missing = round((data.isnull().sum() * 100) / len(data),2)
missing_col_percent = pd.DataFrame({'column_name': data.columns, 'percent_missing': percent_missing})

In [None]:
missing_col_percent

In [None]:
#Finding all the columns with more than 50% of Nans
more_than_fifty = []
for index,row in missing_col_percent.iterrows():
    if row['percent_missing'] > 50:
        more_than_fifty.append(row['column_name'])
more_than_fifty

In [None]:
#Dropping the columns with more than 50% Nans
data = data.drop(more_than_fifty, axis = 1)

In [None]:
#Listing out all the columns with atleast 1 Nan
nan_cols = [i for i in data.columns if data[i].isnull().any()]
nan_cols

In [None]:
#Number of Columns with atleast 1 Nan
len(nan_cols)

In [None]:
#Filling all the Nans with median of each sector's respective column 
data = data.fillna(data.groupby('Sector').transform('median'))

In [None]:
data.info()

In [None]:
#Assigning the stock tickers to a new list
name_stocks = data['Unnamed: 0']

In [None]:
#Seperating the categorical and numerical columns
categorical_values = data[list(data.select_dtypes(include='object').columns)]
numerical_values = data[list(data.select_dtypes(include=['float64', 'int64']).columns)]

In [None]:
#Creating the X and Y datasets and assigning all the numerical columns to X while dropping the class column which is our y column
X = numerical_values.drop(['Class'] , axis = 1)
y = numerical_values['Class']

In [None]:
#Since sector is a categorical variable we need to get_dummies. 
X['Sector'] = categorical_values['Sector']

In [None]:
#Creating dummy variables for Sector variable
dummies = pd.get_dummies(X['Sector'])

In [None]:
#Concatenating the dummes back into out X dataset
X = pd.concat([X, dummies], axis=1)

In [None]:
#Removing the sector varible
X = X.drop(['Sector'], axis=1)

In [None]:
#co-relation data
data.corr()['Class'].abs().sort_values(ascending = False)

In [None]:
#Dropping operating margin column as it has no correlation with out target variable
X = X.drop(['operatingProfitMargin'], axis = 1)


In [None]:
#Scaling our variable to get accurate results
std_scal = StandardScaler()
X_std = std_scal.fit_transform(X)

In [None]:
#Cor-relation amongst each other
X.corr()

In [None]:
#Splitting into train,test and validation datasets
X_train_all, X_test, y_train_all, y_test = train_test_split(X_std, y, test_size = 0.05, random_state = 90)
X_train, X_validation, y_train, y_validation = train_test_split(X_train_all, y_train_all, test_size = 0.20, random_state = 90)

In [None]:
#Helper Functions

#Function to calculate performance results
def performance_results(model, X, y):
    """Score a fitted classifier on (X, y) and draw its confusion matrix.

    Parameters
    ----------
    model : fitted estimator with ``predict`` and, when available, ``predict_proba``.
    X : features to predict on.
    y : true class labels for the rows of ``X``.

    Returns
    -------
    tuple ``(acc_score, log_l, y_hat)``: accuracy, log loss and the predicted labels.
    """
    # Hard class predictions, used for the confusion matrix and the accuracy
    y_hat = model.predict(X)

    #metrics
    conf_mat = confusion_matrix(y, y_hat)
    acc_score = accuracy_score(y, y_hat)

    # Log loss is defined on predicted probabilities. Feeding it hard 0/1 labels only
    # measures the error rate on an inflated scale, so probabilities are used whenever
    # the model can produce them; the label-based value is kept purely as a fallback.
    if hasattr(model, 'predict_proba'):
        # `labels` pins the column order of predict_proba even if `y` misses a class
        log_l = log_loss(y, model.predict_proba(X), labels=getattr(model, 'classes_', None))
    else:
        log_l = log_loss(y, y_hat)

    #graph
    plt.figure(figsize=(10,7))
    sns.heatmap(conf_mat, annot=True, cmap = 'hot_r')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    return acc_score, log_l, y_hat

def graph_cv(cv_res, param='param_C', score_label='Accuracy'):
    """Plot the mean train and test cross-validation score against one searched parameter.

    Parameters
    ----------
    cv_res : table built from a grid search's ``cv_results_``; must contain the columns
        ``param``, ``'mean_train_score'`` and ``'mean_test_score'``.
    param : name of the column holding the searched parameter values (default ``'param_C'``).
    score_label : text used for the y axis, title and legend. The plotted values are whatever
        scorer the search was run with, so pass a matching label when it is not accuracy.
    """
    # 'param_C' -> 'C' for the x axis label
    x_label = param[len('param_'):] if param.startswith('param_') else param
    legend_suffix = score_label.lower()

    plt.figure(figsize=(10,5))
    plt.plot(cv_res[param], cv_res['mean_train_score'])
    plt.plot(cv_res[param], cv_res['mean_test_score'])
    plt.xlabel(x_label)
    plt.ylabel(score_label)
    plt.title(score_label + " with penalization")
    plt.legend(['train ' + legend_suffix, 'test ' + legend_suffix], loc='lower right')
    plt.show()

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as a coloured grid with the value written in every cell.

    Parameters
    ----------
    cm : 2-D array, rows are true classes and columns are predicted classes.
    classes : tick labels, one per class.
    normalize : when True every row is divided by its sum, so cells show per-class ratios.
    title : plot title.
    cmap : matplotlib colormap for the grid.
    """
    # Normalise BEFORE drawing so the colours and the printed numbers describe the same matrix
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Ratios are shortened to two decimals; raw counts are printed unchanged
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Part B

# Model 1 - Logistic Regression - Ridge Penalization in Classification

In [None]:
#Penalization in classification - Lasso-Ridge
lr_ridge = LogisticRegression(solver = "saga", tol = 0.001,
                                   penalty = 'l2', random_state = 90)


In [None]:
# Calibrating our shrink parameter 'C'
params ={'C':np.linspace(0.0001,0.1, 10)}
lr_ridge_cv = GridSearchCV(estimator = lr_ridge, 
                                    param_grid = params,
                                    scoring ="neg_log_loss",
                                    return_train_score = True,
                                    cv =10)

In [None]:
#Fitting our test and train sets
lr_ridge_cv.fit(X_train, y_train)
cv_results = pd.DataFrame(lr_ridge_cv.cv_results_)

In [None]:
# Let's plot the value of the cross-validation to see how the Mean squred error change with alpha
graph_cv(cv_results)
# this is the value of the best lambda
print("Best penalization parameter (C): ", lr_ridge_cv.best_params_['C'])

In [None]:
# Validation Results
r_acc_score, r_log_l, r_y_hat_validation = performance_results(lr_ridge_cv, X_validation, y_validation)
print("validation acc_score       : ", r_acc_score)
print("validation log loss      : ", r_log_l)

In [None]:
# Comparision of real and predicted values 
name_stocks.iloc[y_validation.index,]
output_df = pd.DataFrame(np.hstack([y_validation.values.reshape(-1,1), r_y_hat_validation.reshape(-1,1)]), 
                        index = list(name_stocks.iloc[y_validation.index,]), 
                        columns = ['Real', 'Predicted'])
output_df

In [None]:
# Test Results
rt_acc_score, rt_log_l, rt_y_hat_test_ridge = performance_results(lr_ridge_cv, X_test, y_test)
print("test acc_score       : ", rt_acc_score)
print("test log loss      : ", rt_log_l)

In [None]:
# Using stock tickers to see the relative data
name_of_stock = list(name_stocks.iloc[y_test.index,])
print("Names stock: ", name_of_stock)

In [None]:
output_df_test = pd.DataFrame(np.hstack([y_test.values.reshape(-1,1), rt_y_hat_test_ridge.reshape(-1,1)]), 
                        index = name_of_stock, 
                        columns = ['Real', 'Predicted Ridge'])
output_df_test


# Model 2 - Logistic Regression - Lasso Penalization for Classification

In [None]:
# Defining our penalized Logistic regression using the Lasso (L1) penalty.
# class_weight has to be the None object (no re-weighting of the classes); the string 'None'
# is not one of the accepted values. The alternative considered was class_weight = 'balanced'.
lr_lasso = LogisticRegression(solver = "saga", tol = 0.001,
                                   penalty = 'l1', random_state = 90, class_weight = None)


# Calibrating our shrink parameter 'C' with 10-fold cross-validation scored by negative log loss
params ={'C':np.linspace(0.001,1, 10)}
lr_lasso_cv = GridSearchCV(estimator = lr_lasso, 
                                    param_grid = params,
                                    scoring ="neg_log_loss",
                                    return_train_score = True,
                                    cv =10)

In [None]:
# Fitting the model
lr_lasso_cv.fit(X_train, y_train)
cv_results = pd.DataFrame(lr_lasso_cv.cv_results_)

In [None]:
# Let's plot the value of the cross-validation to see how the Mean squred error change with alpha
graph_cv(cv_results)
# this is the value of the best lambda
print("Best penalization parameter (C): ", lr_lasso_cv.best_params_['C'])

In [None]:
# Validation Results
l_acc_score, l_log_l, l_y_hat_val_lasso = performance_results(lr_lasso_cv, X_validation, y_validation)
print("validation acc_score       : ", l_acc_score)
print("validation log loss      : ", l_log_l)

In [None]:
# Test Results
lt_acc_score, lt_log_l, lt_y_hat_test_lasso = performance_results(lr_lasso_cv, X_test, y_test)
print("test acc_score       : ", lt_acc_score)
print("test log loss      : ", lt_log_l)

In [None]:
# Comparision of predicted and reak outputs 
output_df = pd.DataFrame(np.hstack([y_test.values.reshape(-1,1), lt_y_hat_test_lasso.reshape(-1,1), rt_y_hat_test_ridge.reshape(-1,1)]), 
                        index = name_of_stock, 
                        columns = ['Real', 'Predicted Lasso', 'Predicted Ridge'])
output_df


In [None]:
#Output of lasso and ridge predicted probabilities 
lasso_pred_proba = lr_lasso_cv.predict_proba(X_test)
ridge_pred_proba = lr_ridge_cv.predict_proba(X_test)

lasso_pred_proba_df = pd.DataFrame(np.hstack([lasso_pred_proba, lt_y_hat_test_lasso.reshape(-1,1), y_test.values.reshape(-1,1)]),
                    index = name_of_stock, 
                    columns = ['Prob 0', 'Prob 1', 'Lasso pred', 'True'])

ridge_pred_proba_df = pd.DataFrame(np.hstack([ridge_pred_proba, rt_y_hat_test_ridge.reshape(-1,1), y_test.values.reshape(-1,1)]),
                    index = name_of_stock, 
                    columns = ['Prob 0', 'Prob 1', 'Ridge pred', 'True'])


In [None]:
lasso_pred_proba_df

In [None]:
ridge_pred_proba_df

# Model 3 - Random Forest in Classification

In [None]:
#Random Forest 
rf = RandomForestClassifier(random_state = 90)

parameters = {'n_estimators':[2,5,10,15,40,50,80,100,150,200],
             'max_depth':range(1,5)}

rf_cv =GridSearchCV(estimator = rf, 
                    param_grid = parameters,
                    scoring ="neg_log_loss",  
                    return_train_score = True,
                    cv =10)

In [None]:
#Fitting the model
rf_cv.fit(X_train, y_train)
print(rf_cv.best_estimator_)

In [None]:
# Validation results
rf_acc_score, rf_log_l, rf_y_hat_validation = performance_results(rf_cv, X_validation, y_validation)
print("validation acc_score       : ", rf_acc_score)
print("validation log loss      : ", rf_log_l)

# Test results
rft_acc_score, rft_log_l, rft_y_hat_test = performance_results(rf_cv, X_test, y_test)
print("test acc_score       : ", rft_acc_score)
print("test log loss      : ", rft_log_l)

In [None]:
# Output of real and predicted data
output_df = pd.DataFrame(np.hstack([y_test.values.reshape(-1,1), rft_y_hat_test.reshape(-1,1)]), 
                        index = list(name_stocks.iloc[y_test.index].values), 
                        columns = ['Real', 'Random Forest Predicted'])
output_df

# Model 4 - Decision Tree Classification with AdaBoost

In [None]:
#Random Forest Boosting
# ADA Boosting Classifier
dtc = DecisionTreeClassifier(max_depth =3 ,random_state = 90)
adab_clf = AdaBoostClassifier(base_estimator = dtc, random_state = 90)

parameters = {'n_estimators':[2,5,10],
             'learning_rate':np.linspace(1,5,4)}

# Specify the cross-validation
adab_clf_cv =GridSearchCV(estimator = adab_clf, 
                    param_grid = parameters,
                    scoring ="neg_log_loss",  # accuracy
                    return_train_score = True,
                    cv =10)

In [None]:
adab_clf_cv.fit(X_train, y_train)
print(adab_clf_cv.best_estimator_)

In [None]:
#Validation results of the tuned AdaBoost model
ada_validation = performance_results(adab_clf_cv, X_validation, y_validation)
ada_acc_score, ada_log_l, ada_y_hat_validation = ada_validation
print("validation acc_score       : ", ada_acc_score)
print("validation log loss      : ", ada_log_l)

#Test results of the tuned AdaBoost model
ada_test = performance_results(adab_clf_cv, X_test, y_test)
adat_acc_score, adat_log_l, adat_y_hat_test = ada_test
print("test acc_score       : ", adat_acc_score)
print("test log loss      : ", adat_log_l)

# Model 5 - Gradient Boosting Classification

In [None]:
# Gradient Boosting Classifier 

gb_clf = GradientBoostingClassifier(random_state = 90)

parameters = {'n_estimators':[2,5,10],
             'learning_rate':np.linspace(0.1,0.5,5)}

# Specify the cross-validation
gb_clf_cv =GridSearchCV(estimator = gb_clf, 
                    param_grid = parameters,
                    scoring ="neg_log_loss",  # accuracy
                    return_train_score = True,
                    cv =10)

In [None]:
#Fitting the model 
gb_clf_cv.fit(X_train, y_train)
print(gb_clf_cv.best_estimator_)

In [None]:
# Validation results of the tuned gradient boosting model
gb_validation = performance_results(gb_clf_cv, X_validation, y_validation)
gb_acc_score, gb_log_l, gb_y_hat_validation = gb_validation
print("validation acc_score       : ", gb_acc_score)
print("validation log loss      : ", gb_log_l)

# Test results of the tuned gradient boosting model
gb_test = performance_results(gb_clf_cv, X_test, y_test)
gbt_acc_score, gbt_log_l, gbt_y_hat_test = gb_test
print("test acc_score       : ", gbt_acc_score)
print("test log loss      : ", gbt_log_l)

# Model 6 - Neural Networks 

In [None]:
#Neural Networks
#One-hot encoding of the targets. Both splits are encoded against the classes seen in training,
#so the two arrays always have the same columns in the same order even if the small test split
#happens to miss a class. Stored as floats (NOTE(review): presumably what the loss expects).
nn_classes = np.unique(y_train)
y_train_nn = (y_train.values.reshape(-1,1) == nn_classes).astype('float32')
y_test_nn = (y_test.values.reshape(-1,1) == nn_classes).astype('float32')

In [None]:
#Creating a model
def classification_model(x, y, n_neurons):
    """Build and compile a fully connected softmax classifier.

    Parameters
    ----------
    x : 2-D feature array; only ``x.shape[1]`` (number of features) is used.
    y : 2-D one-hot target array; only ``y.shape[1]`` (number of classes) is used.
    n_neurons : sequence of hidden layer sizes, one ReLU layer per entry
        (``[80]`` gives a single hidden layer of 80 neurons).

    Returns
    -------
    The compiled, not yet fitted, ``Sequential`` model.
    """
    # Hidden ReLU layers followed by the softmax output layer
    layer_specs = [(units, 'relu') for units in n_neurons]
    layer_specs.append((y.shape[1], 'softmax'))

    model = Sequential()
    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer is told the input width; later layers infer theirs
        first_layer_args = {'input_dim': x.shape[1]} if position == 0 else {}
        model.add(Dense(units, kernel_initializer='normal', activation=activation, **first_layer_args))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
n_neurons = [80]
classification_model_hl = classification_model(X_train, y_train_nn, n_neurons)
classification_model_hl.summary()

# Fit the model
classification_model_hl.fit(X_train, y_train_nn, epochs=100, batch_size=5, verbose=2)

In [None]:
scores = classification_model_hl.evaluate(X_test, y_test_nn, verbose=0)
print('Test log-loss:', scores[0])
print('Test accuracy:', scores[1])

In [None]:
y_pred = classification_model_hl.predict(X_test)
# Convert predictions probabilities to classes
# We need select which classes has the highest probability
y_pred_classes = np.argmax(y_pred, axis = 1) 

# Compute the confusion matrix
confusion_mtx = confusion_matrix(y_test, y_pred_classes) 


# plot the confusion matrix
plot_confusion_matrix(confusion_mtx, classes = range(y_pred.shape[1]))

# Part C

In [None]:
#Creating lists of all the accuracy scores and log losses for each model 
acc_scores = []
log_loss = []
lables = []
lables.extend(['Lasso', 'Ridge', 'Random Forest', 'ADA Booster', 'Gb' , 'Neural Networks' ])
acc_scores.extend([lt_acc_score, rt_acc_score, rft_acc_score, adat_acc_score, gbt_acc_score, scores[1]])
log_loss.extend([lt_log_l, rt_log_l, rft_log_l, adat_log_l, gbt_log_l, scores[0]])

In [None]:
#Creating a Dataframe of the comparisions 
scores = {'Lables': lables,
          'Accuracy Scores': acc_scores,
           'Log Loss': log_loss}

scores_df = pd.DataFrame(scores)

In [None]:
scores_df

Gradient Booster is the best model for this data


In [None]:
#Loading stock_test dataset
test_data = pd.read_csv(r'D:\Stock_test.csv', sep = ",")
test_data.head()

In [None]:
# Dropping the columns that we dint use for training the model
test_data = test_data.drop(more_than_fifty, axis = 1)

In [None]:
#Replacing all the missing value with medians grouped by each sector
test_data = test_data.fillna(data.groupby('Sector').transform('median'))

In [None]:
#Storing the test stock tickers 
test_name_stocks = test_data['Unnamed: 0']

In [None]:
test_data = test_data.drop(['Unnamed: 0', 'operatingProfitMargin'], axis = 1)

In [None]:
#Creating dummies
dummies_test = pd.get_dummies(test_data['Sector'])

In [None]:
test_data = pd.concat([test_data, dummies_test], axis=1)

In [None]:
test_data = test_data.drop(['Sector'], axis=1)

In [None]:
#Scaling the test data
std_scal = StandardScaler()
test_data_std = std_scal.fit_transform(test_data)

In [None]:
#Since gradient boosting gave us the vest results, we will use that to predict our class for the test set
y_hat_final = gb_clf_cv.predict(test_data_std)

In [None]:
#Converting the results to dataframe
y_df = pd.DataFrame(y_hat_final)

In [None]:
y_df ["Stock tickers"] = test_name_stocks
y_df = y_df[["Stock tickers", 0]]
y_df = y_df.rename(columns={0: "Result"})

In [None]:
#Dataframe with all the stockes that are profitable according to our model
y_df_ones = y_df[y_df["Result"] == 1]

In [None]:
y_df, y_df_ones

# Part C4 

In [None]:
#Downloading the daily prices between the end of 2018 and the end of 2019 for the stocks
#that the model predicted as class 1
one_tickers = list(y_df_ones['Stock tickers'])
Tickers = one_tickers
start = datetime.datetime(2018,12,31)
end = datetime.datetime(2019,12,31)
#auto_adjust is switched off explicitly: when yfinance adjusts prices automatically it does
#not return the separate 'Adj Close' column that is read on the next line
#NOTE: this rebinds `data`, which up to here held the training dataframe
data = yf.download(Tickers, start = start, end = end, interval = '1d', auto_adjust = False)
prices = data['Adj Close']

In [None]:
#Dropping the columns with no 2018 data 
prices = prices.dropna(axis='columns')

In [None]:
price_2018 = prices.iloc[0,:] #2018 price
price_2019 = prices.iloc[-1,:] #2019 price

series = { '2018 price': price_2018 , '2019 price': price_2019 } #series of the prices

prices_df = pd.DataFrame(series) # dataframe of the prices

prices_df['Price Change'] = prices_df['2019 price'] - prices_df['2018 price'] #computing the difference of the 2 prices

In [None]:
prices_df

In [None]:
#Computing the absolute variation. If we buy 1 unit of each sof the stock, the net change in the portfolio is avsolute variation
absolute_variation = prices_df['Price Change'].sum()

In [None]:
absolute_variation