# Tuning parameters of the tf-idf base model
# Using first-year 10-K filings & next-year stock price (split into train & test)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def remove_Outliers(dataframe, column_name, outlierConstant = 1.5):
    """Drop rows whose ``column_name`` value lies on or outside the IQR fences.

    The fences are ``Q1 - outlierConstant * (Q3 - Q1)`` and
    ``Q3 + outlierConstant * (Q3 - Q1)``. A row is removed when its value is
    less than or equal to the lower fence, or greater than or equal to the
    upper fence.

    Rows are filtered with a positional boolean mask, so the result does not
    depend on the index labels of ``dataframe`` (a frame that was previously
    filtered or sliced, and therefore has a non 0..n-1 index, is handled
    correctly).

    Args:
        dataframe: pandas DataFrame to filter.
        column_name: Name of the numeric column used to detect outliers.
        outlierConstant: Multiplier applied to the inter-quartile range when
            computing the fences.

    Returns:
        A new DataFrame without the outlier rows, re-indexed from 0.
    """
    values = np.asarray(dataframe[column_name])
    if values.size == 0:
        # Percentiles of an empty column are undefined; nothing to remove.
        return dataframe.reset_index(drop=True)

    lower_quartile, upper_quartile = np.percentile(values, [25, 75])
    margin = (upper_quartile - lower_quartile) * outlierConstant
    lower_fence = lower_quartile - margin
    upper_fence = upper_quartile + margin

    # Values sitting exactly on a fence count as outliers.
    is_outlier = (values <= lower_fence) | (values >= upper_fence)
    return dataframe[~is_outlier].reset_index(drop=True)

In [3]:
# Load the custom stop-word list: one entry per line of the file, with
# surrounding whitespace (including the newline) stripped.
with open("stop_word_new.txt", "r") as stop_word_file:
    stop_words_list = [str(entry.strip()) for entry in stop_word_file]

In [4]:
# Pre-process a document; pass exclude_digit=True to drop digits from the output
def pre_process(file_content, exclude_digit):
    """Lower-case ``file_content`` and split it into alphanumeric tokens.

    Args:
        file_content: Raw document text.
        exclude_digit: When equal to ``False`` digits are kept (commas are
            removed first, so "1,000" becomes the single token "1000");
            for any other value every non-letter character is treated as a
            separator.

    Returns:
        A list of lower-case tokens.
    """
    lowered = file_content.lower()

    # The equality test (rather than `not exclude_digit`) is intentional:
    # only values equal to False select the digit-preserving mode.
    if exclude_digit == False:
        separator_pattern = '[^a-zA-Z0-9]'
        lowered = re.sub(',', '', lowered)
    else:
        separator_pattern = '[^a-zA-Z]'

    spaced = re.sub(separator_pattern, ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.split()

In [17]:
def Exclude_Stop_words(processed_article, stop_words_list, min_length=None):
    """Remove stop words from a token list and join the rest with spaces.

    Args:
        processed_article: Iterable of word tokens.
        stop_words_list: Collection of words to drop.
        min_length: Optional minimum token length. When given, tokens shorter
            than this are dropped as well. The default (``None``) applies no
            length filter, which matches the behaviour of the two-argument
            call.

    Returns:
        A single string containing the surviving tokens, in their original
        order, separated by single spaces.
    """
    # Build the lookup once: set membership is O(1) per token, whereas
    # scanning a list costs O(len(stop_words_list)) for every token.
    blocked = frozenset(stop_words_list)
    kept = [word for word in processed_article if word not in blocked]

    if min_length is not None:
        kept = [word for word in kept if len(word) >= min_length]

    return ' '.join(kept)

In [6]:
def pre_encode_tfidf(path, files, skip_chars=3000):
    """Read each listed file and turn it into a cleaned, space-joined string.

    For every file name in ``files`` (except '.DS_Store') the file is read
    from ``path``, its first ``skip_chars`` characters are discarded, the
    remainder is tokenised with ``pre_process`` (digits excluded) and the
    tokens are filtered with ``Exclude_Stop_words`` using the module-level
    ``stop_words_list``.

    Args:
        path: Directory containing the files.
        files: Iterable of file names relative to ``path``.
        skip_chars: Number of leading characters to drop from every file
            (presumably header boilerplate of the filings -- confirm).
            Expected to be non-negative.

    Returns:
        A list with one cleaned string per processed file, in the order of
        ``files``.
    """
    documents = []
    for filename in files:
        if filename == '.DS_Store':
            # Skip this entry; it is never opened or processed.
            continue
        with open(path+'/'+filename,'r') as handle:
            raw_text = handle.read()[skip_chars:]
        # The file is closed before the (potentially slow) text processing.
        tokens = pre_process(raw_text, True)
        documents.append(Exclude_Stop_words(tokens, stop_words_list))
    return documents

# Read the mapping between 10-K files and stock prices

In [7]:
# Directory that holds the raw filing texts (machine-specific absolute path).
path = '/Users/faustune/Desktop/data/1997.full'
# Table read from CSV; its 'filename' and 'close_adjusted' columns are used
# further down in this file.
df = pd.read_csv('1997_10k_1998_price.csv')

In [8]:
# dfy = dfy[dfy['close_adjusted']<70]
# dfy.hist('close_adjusted')

## Remove Outliers

In [9]:
# Filter price outliers first, then take the targets and the file names from
# the same filtered frame so both stay aligned row by row.
dfy = remove_Outliers(df, 'close_adjusted')
y = dfy['close_adjusted']
files = dfy['filename'].tolist()

# pre-process data: whether( drop stopword; word length<4; include digit)

In [10]:
# One cleaned text string per file listed in `files`, in the same order.
X0 = pre_encode_tfidf(path=path, files=files)

# Transform into tfidf matrix
# Parameter: which `analyzer` the vectorizer uses (char / word)

In [11]:
# Character-level tf-idf. `stop_words` is intentionally not passed here:
# scikit-learn applies it only when analyzer='word' and ignores it otherwise,
# so it has no effect on character features.
vectorizer = TfidfVectorizer(analyzer='char')
# Word-level alternative (stop-word removal is effective in this mode):
# vectorizer = TfidfVectorizer(analyzer='word',stop_words='english')

In [12]:
# Fit the vectorizer on the cleaned documents and encode them in one call;
# X is the resulting document-term tf-idf matrix.
X = vectorizer.fit_transform(X0)

# Split the data, train the model, and compute the test MSE over 10 runs

In [13]:
# Evaluate a linear regression on ten different random 80/20 splits.
# `avg` collects the test MSE of every run (the name is kept because the
# averaging cell further down reads it).
avg = []
for i in range(10):
    # Seed each split with the run number: the ten splits still differ from
    # one another, but re-running the notebook reproduces them exactly, so
    # MSE changes between runs reflect parameter changes rather than
    # split noise.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i)
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(i)
    mse = mean_squared_error(y_test, y_pred)
    print(mse)
    avg.append(mse)

0
40.7112268177977
1
48.52232311688574
2
42.023805700721994
3
48.11431703175929
4
38.93793312518353
5
44.650736583607696
6
37.91070003195739
7
37.666051543695005
8
32.534756427377715
9
40.56888365807317


# Calculate the average MSE over the 10 runs

In [14]:
# Mean of the per-run MSE values. The builtin sum() is used instead of a
# manual accumulator named `sum`, which would rebind the builtin for the rest
# of the session.
mean_mse = sum(avg) / len(avg)
mean_mse

41.16407340370592

In [15]:
# X.toarray()
# X = pd.DataFrame(X.toarray())