# Tuning the parameters of the TF-IDF base model
# 1. Train the model on the 10-K filings of the first year and the stock prices of the second year
# 2. Use the 10-K filings of the second year to predict the stock prices of the third year

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def remove_Outliers(dataframe, column_name, outlierConstant = 1.5):
    """Return a copy of ``dataframe`` without IQR outliers in ``column_name``.

    A row is treated as an outlier when its value lies on or outside the
    fences ``Q1 - outlierConstant * IQR`` and ``Q3 + outlierConstant * IQR``
    (the comparison is inclusive, so values exactly on a fence are dropped).

    Args:
        dataframe: pandas DataFrame to filter.
        column_name: name of the numeric column used to detect outliers.
        outlierConstant: multiplier applied to the inter-quartile range.

    Returns:
        A new DataFrame holding the remaining rows, re-indexed from 0.
    """
    values = np.asarray(dataframe[column_name])
    if values.size == 0:
        # Nothing to measure quartiles on; hand back an equivalent empty frame.
        return dataframe.reset_index(drop = True)

    lower_quartile, upper_quartile = np.percentile(values, [25, 75])
    margin = (upper_quartile - lower_quartile) * outlierConstant
    lower_fence = lower_quartile - margin
    upper_fence = upper_quartile + margin

    # Select rows by position with a boolean mask. Matching positions against
    # index labels would drop the wrong rows whenever the frame does not carry
    # a default 0..n-1 index (e.g. after filtering or sorting).
    is_outlier = (values <= lower_fence) | (values >= upper_fence)
    return dataframe[~is_outlier].reset_index(drop = True)

In [3]:
# Load the stop-word vocabulary: one entry per line, surrounding whitespace removed.
stop_words_list = []
with open("stop_word_new.txt", "r") as stop_word_file:
    stop_words_list.extend(line.strip() for line in stop_word_file)

In [4]:
def pre_process(file_content, exclude_digit):
    """Lower-case a document and split it into alphanumeric tokens.

    Args:
        file_content: raw text of the document.
        exclude_digit: pass ``True`` to keep letters only; pass ``False`` to
            keep digits as well (commas are deleted first, so "1,000"
            becomes the single token "1000").

    Returns:
        List of tokens; every other character acts as a separator.
    """
    text = file_content.lower()

    # The explicit `== False` is kept on purpose: any argument that does not
    # compare equal to False (including None) selects the letters-only branch.
    if exclude_digit == False:
        text = text.replace(',', '')
        separator_pattern = '[^a-zA-Z0-9]'
    else:
        separator_pattern = '[^a-zA-Z]'

    text = re.sub(separator_pattern, ' ', text)
    return re.sub(r'\s+', ' ', text).split()

In [5]:
def Exclude_Stop_words(processed_article, stop_words_list):
    """Join the tokens of an article into one space-separated string.

    NOTE: despite the name, no filtering is applied at the moment and
    ``stop_words_list`` is accepted but unused. Two filters were tried
    earlier and are currently disabled: keeping only words longer than four
    characters, and dropping words found in ``stop_words_list``.

    Args:
        processed_article: iterable of string tokens.
        stop_words_list: stop words (currently ignored).

    Returns:
        The tokens joined by single spaces ('' for an empty input).
    """
    # Join directly on the literal so the builtin `str` is not shadowed.
    return ' '.join(processed_article)

In [6]:
def pre_encode_tfidf(path, files):
    """Read and clean the listed documents so they can be fed to a vectorizer.

    For every name in ``files`` the file ``path/<name>`` is read, its first
    3000 characters are discarded (presumably header boilerplate - TODO
    confirm), and the rest is tokenised with digits excluded and re-joined
    into a single string.

    Entries named '.DS_Store' are skipped, so the result can be shorter than
    ``files``.

    Args:
        path: directory that contains the documents.
        files: iterable of file names inside ``path``.

    Returns:
        List of cleaned document strings, in the order of ``files``.
    """
    documents = []
    for name in files:
        if name == '.DS_Store':
            continue
        with open(path+'/'+name,'r') as handle:
            raw_text = handle.read()
        tokens = pre_process(raw_text[3000:], True)
        documents.append(Exclude_Stop_words(tokens, stop_words_list))
    return documents

In [7]:
# Character-level TF-IDF features (default n-gram range).
# `stop_words` is only honoured by scikit-learn when analyzer == 'word', so
# the former stop_words='english' argument had no effect here and is dropped;
# the resulting features are unchanged.
# NOTE(review): if word-level features were intended, switch to
# analyzer='word' and pass stop_words again - that changes the model and the MSE.
vectorizer = TfidfVectorizer(analyzer='char')

In [8]:
# Training data: targets come from the price CSV, documents from the filings
# directory (years as given by the file and directory names).
path_train = '/Users/faustune/Desktop/data/1999.full'
df_train = pd.read_csv('1999_10k_2000_price.csv')

# Drop rows whose target price is an IQR outlier before building X and y.
dfy_train = remove_Outliers(df_train, 'close_adjusted')
y_train = dfy_train['close_adjusted']
files_train = dfy_train['filename'].tolist()

# Clean the documents, then learn the TF-IDF vocabulary/weights on them.
X0_train = pre_encode_tfidf(path_train, files_train)
X_train = vectorizer.fit_transform(X0_train)

In [9]:
# Fit a linear regression from the TF-IDF features to the target prices.
reg = LinearRegression()
reg.fit(X_train, y_train)

In [10]:
# Evaluation data, prepared the same way as the training data.
path_test = '/Users/faustune/Desktop/data/2000.full'
df_test = pd.read_csv('2000_10k_2001_price.csv')

# NOTE(review): outliers are also removed from the evaluation targets, so the
# reported error excludes those rows - confirm this is intended.
dfy_test = remove_Outliers(df_test, 'close_adjusted')
y_test = dfy_test['close_adjusted']
files_test = dfy_test['filename'].tolist()

# Reuse the vectorizer fitted on the training documents (transform only).
X0_test = pre_encode_tfidf(path_test, files_test)
X_test = vectorizer.transform(X0_test)

In [11]:
# Predict target prices for the evaluation documents with the fitted model.
y_pred = reg.predict(X_test)

In [12]:
# Mean squared error between the actual and the predicted prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

226.76287985277656