# Logistic Regression

### Importing Libraries & Data

In [1]:
import pandas as pd
import requests
import json
import pickle 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
import numpy as np
import time
np.random.seed(42)

from sklearn.linear_model import LogisticRegressionCV , LogisticRegression

from sklearn.feature_extraction import stop_words



In [2]:
# Read the three CSV files used by this notebook; the names on the left are
# bound in the same order as the paths listed below.
full_data, stock_data, crypto_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/stock_crypto_data.csv',
        '../Data/stock_text_title.csv',
        '../Data/crypto_text_title.csv',
    )
)

#### Transferring the data introduced null values into my DataFrames, so I am filling them with a single space

In [3]:
# Number of missing values in each column (`isna` is the alias of `isnull`).
full_data.isna().sum()

selftext     1091
title           0
subreddit       0
y               0
dtype: int64

In [4]:
# Replace missing post bodies with a single space so that concatenating
# `selftext` with `title` later on does not yield NaN for those rows.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame.selftext`: that form is chained
# in-place mutation, which pandas warns about and which does not update the
# parent DataFrame when Copy-on-Write is enabled.
for frame in (crypto_data, stock_data, full_data):
    frame['selftext'] = frame['selftext'].fillna(' ')

## Train test split
#### Default train/test split: training set (75%), test set (25%)

In [5]:
# Features are the two text columns; the target is the `y` column.
X = full_data[['selftext', 'title']]
y = full_data['y']

# Default split sizes are used (no `test_size` / `train_size` given).
# The seed is passed explicitly so the split does not depend on how much of
# NumPy's global random stream was consumed earlier in the kernel session,
# and re-running this cell reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### Exporting my X & y to CSV

In [8]:
# Write the unsplit features and target to CSV, omitting the index.
for obj, out_path in ((X, '../Data/X.csv'), (y, '../Data/y.csv')):
    obj.to_csv(out_path, index=False)

In [9]:
# Extra tokens to ignore, appended after scikit-learn's English stop words.
extra_stop_words = [
    'crypto', 'cryptocurrency', 'cryptocurrencies',
    'stock', 'stocks',
    'https', 'io', 'fintel', 'amp', 'com', '13d', '8209', '08', 'gt', 'www',
    '000', 'http', '07',
    'x200b', 'daex', 'click',
]
my_sw = [*stop_words.ENGLISH_STOP_WORDS, *extra_stop_words]

In [10]:
# Display the (rows, columns) shape of the training features.
X_train.shape

(1590, 2)

In [11]:
# Preview the first three rows of the feature frame.
X.head(3)

Unnamed: 0,selftext,title
0,,The Official r/StockMarket Discord Live Chat Room
1,,Most Anticipated Earnings Releases for the tra...
2,"I'm sure there are many links, but here's the ...",Amazon becomes world's second $1tn company


### Adding my selftext and title together into one corpus in order to process it through TF-IDF

In [12]:
def _join_text(frame):
    """Return each row's `selftext` and `title` joined by a single space."""
    return frame['selftext'] + ' ' + frame['title']


# One text document per post, for the train and test partitions respectively.
corpus_tfidf_train = _join_text(X_train)
corpus_tfidf_test = _join_text(X_test)

In [13]:
# Display the shape of the combined training corpus (one entry per post).
corpus_tfidf_train.shape

(1590,)

## Instantiating my TF-IDF Vectorizer
### TF-IDF: Term Frequency–Inverse Document Frequency

In [14]:
# TF-IDF vectorizer that ignores the custom stop-word list.
tvec = TfidfVectorizer(stop_words=my_sw)

# Fit on the training corpus only and transform it in the same pass; fitting
# and transforming separately processes the training text twice for the same
# result.
X_train_tran = tvec.fit_transform(corpus_tfidf_train)

# Kept for backward compatibility: `fit` returns the vectorizer itself, so
# this name has always referred to the same object as `tvec`.
X_train_fit = tvec

# The test corpus is only transformed, using what was learned from the
# training corpus.
X_test_tran = tvec.transform(corpus_tfidf_test)

In [15]:
# Preview the first rows of the training target.
y_train.head()

331     1
194     1
322     1
266     1
1582    0
Name: y, dtype: int64

### Vectorizers put data into a sparse format; using the `todense()` function expands the sparse matrix into a dense one

In [16]:
# Column labels come from the fitted vectorizer.
# NOTE(review): newer scikit-learn releases replace `get_feature_names` with
# `get_feature_names_out` — confirm against the installed version.
feature_names = tvec.get_feature_names()

# Expand the sparse TF-IDF matrices into dense DataFrames with one column per
# feature name.
df_train = pd.DataFrame(X_train_tran.todense(), columns=feature_names)
df_test = pd.DataFrame(X_test_tran.todense(), columns=feature_names)


#### Exporting my X_train, X_test, y_train, y_test that have already been through the Vectorizer to CSV

In [17]:
# Write the vectorized features and the matching targets to CSV, omitting the
# index in every file.
exports = (
    (df_train, '../Data/tfidf_train_X.csv'),
    (df_test, '../Data/tfidf_test_X.csv'),
    (y_train, '../Data/y_train.csv'),
    (y_test, '../Data/y_test.csv'),
)
for table, csv_path in exports:
    table.to_csv(csv_path, index=False)

## Setting up a cross-validated Logistic Regression
### Penalty is 'l2' (the same squared-coefficient penalty used in Ridge regression)

In [20]:
# Logistic regression with built-in 5-fold cross-validation and an L2 penalty.
logreg = LogisticRegressionCV(cv=5, penalty='l2', random_state=42)
logreg.fit(df_train, y_train)

# Score on the same data the model was fitted on (shown as the cell output).
logreg.score(df_train, y_train)

1.0

In [21]:
# Score the fitted model on the held-out test partition.
logreg.score(X=df_test, y=y_test)

0.9322033898305084

In [22]:
# Serialize the fitted model to disk.
with open('../Data/logreg.pkl', 'wb+') as model_file:
    pickle.dump(logreg, model_file)

## ^Scoring my test set gives 0.932, i.e. the model correctly predicts whether a post belongs to the Stock subreddit or the Cryptocurrency subreddit for about 93% of the test posts

In [18]:
# Per-class probability estimates for every training row (one column per class).
logreg.predict_proba(X=df_train)

array([[5.52469946e-05, 9.99944753e-01],
       [2.69241455e-03, 9.97307585e-01],
       [3.75754039e-03, 9.96242460e-01],
       ...,
       [9.99999283e-01, 7.17181660e-07],
       [9.99993360e-01, 6.64007287e-06],
       [5.64921987e-03, 9.94350780e-01]])

## `predict_proba` shows the model's estimated probability for each class, i.e. how confident it is that a post belongs in Stock vs. Crypto

### Setting the parameter grid for my grid search
##### C is the inverse of the regularization strength (smaller C = stronger regularization)
##### Penalty: l1 or l2 (Lasso or Ridge)

In [19]:
# Hyper-parameter grid for the logistic-regression grid search:
# two penalties x one solver x six values of C.
gs_params = dict(
    penalty=['l1', 'l2'],
    solver=['liblinear'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
)

In [22]:
# Exhaustive search over `gs_params` with 3-fold cross-validation; `fit`
# returns the fitted search object, which is what ends up bound to the name.
lr_gridsearch = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=gs_params,
    cv=3,
    verbose=1,
).fit(df_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   13.4s finished


### Grid search best score 

In [23]:
# Best cross-validated score found by the grid search.
lr_gridsearch.best_score_

0.9050314465408805

In [24]:
# Parameter combination that produced the best cross-validated score.
lr_gridsearch.best_params_

{'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}