# Assignment 2 Notebook 

## Import packages
Run `bash setup.sh` before running this notebook.

In [17]:

# system tools
import os
import sys
sys.path.append("..")

# data munging tools
import pandas as pd
import utils.classifier_utils as clf

# Machine learning stuff
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

# saving models 
from joblib import dump

## Splitting data into train and test subsets

In [26]:
def train_test(X, y, test_size=0.2, random_state=42):
    """Split the texts and their labels into train and test subsets.

    Args:
        X: the texts for the model.
        y: the classification labels, aligned with ``X``.
        test_size: share of the data held out for testing. The default of
            0.2 creates an 80/20 split (80% train data, 20% test data).
        random_state: seed for the shuffling, for reproducibility
            (like ``set.seed()`` in R).

    Returns:
        X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

## Vectorizing and feature extraction

In [27]:
def vectorize(X_train, X_test):
    """Turn the raw texts into TF-IDF features and save the fitted vectorizer.

    Args:
        X_train: training texts; the vectorizer is fitted on these only.
        X_test: test texts; transformed with the already-fitted vectorizer.

    Returns:
        vectorizer: the fitted ``TfidfVectorizer``.
        X_train_feats: TF-IDF features of the training texts.
        X_test_feats: TF-IDF features of the test texts.
    """
    # the vectorizer turns each text into a numerical vector of TF-IDF weights
    vectorizer = TfidfVectorizer(ngram_range=(1, 2),   # unigrams and bigrams, but can be more
                                 lowercase=True,       # making everything lowercase
                                 max_df=0.95,          # drops terms found in more than 95% of documents, the super common ones
                                 min_df=0.05,          # drops terms found in less than 5% of documents, the super rare ones and possibly misspelled words
                                 max_features=500)     # keep only the top 500 features
    # fitting learns the vocabulary and the inverse-document-frequency weights
    # from the training texts, then turns those texts into TF-IDF vectors
    X_train_feats = vectorizer.fit_transform(X_train)
    # the test texts are only transformed, with the vocabulary and weights
    # learned from the training texts - fitting on the test data as well would
    # leak information and make the evaluation overly optimistic
    X_test_feats = vectorizer.transform(X_test)
    # save the fitted vectorizer, creating the models folder if it is missing
    model_dir = os.path.join("..", "models")
    os.makedirs(model_dir, exist_ok=True)
    dump(vectorizer, os.path.join(model_dir, "tfidf_vectorizer.joblib"))
    return vectorizer, X_train_feats, X_test_feats


## Logistic Regression classifier 

In [28]:
 def logistic_classifier():
    # trained on the training data - logistic regression
    classifier = LogisticRegression(random_state=42)
    classifier.fit(X_train_feats, y_train)
    # using test data to see predictions, based on training data 
    y_pred = classifier.predict(X_test_feats)
    # get predictions 
    y_pred = classifier.predict(X_test_feats)
    # show metrics for how model is performing 
    classifier_metrics = metrics.classification_report(y_test, y_pred)
    print(classifier_metrics)
    # saving the model
    dump(classifier, "LR_classifier.joblib")
    return classifier, classifier_metrics

## Neural Network classifier 

In [29]:
def neural_net_classifier(train_feats=None, train_labels=None):
    """Train a small neural network classifier and save it.

    Args:
        train_feats: vectorized training texts.
        train_labels: labels of the training texts.

        Any argument left as ``None`` falls back to the notebook-level
        variable the function used to read directly (``X_train_feats``,
        ``y_train``), so old calls without arguments keep working.

    Returns:
        classifier: the fitted ``MLPClassifier`` model.
    """
    # backwards compatibility with the old zero-argument version
    if train_feats is None:
        train_feats = X_train_feats
    if train_labels is None:
        train_labels = y_train

    # trained on the training data - neural network
    classifier = MLPClassifier(activation="logistic",      # function for the hidden layer, logistic = sigmoid function
                               hidden_layer_sizes=(20,),   # one hidden layer with 20 neurons
                               max_iter=1000,              # upper limit on iterations, the model may converge before reaching it
                               random_state=42)            # for reproducibility
    # classifier is fit to the training data to learn
    classifier.fit(train_feats, train_labels)
    # save the model, creating the models folder if it is missing
    model_dir = os.path.join("..", "models")
    os.makedirs(model_dir, exist_ok=True)
    dump(classifier, os.path.join(model_dir, "NNclassifier.joblib"))
    return classifier


In [30]:
def save_metrics(report=None, out_path=None):
    """Write a classification report to a text file.

    Args:
        report: the report text to save. When left as ``None`` the
            notebook-level ``classifier_metrics`` variable is used, which is
            what the function always read before.
        out_path: where to write the report. Defaults to
            ``../out/classification_report.txt``.

    Returns:
        None
    """
    # backwards compatibility with the old zero-argument version
    if report is None:
        report = classifier_metrics
    if out_path is None:
        out_path = os.path.join("..", "out", "classification_report.txt")
    # make sure the output folder exists before writing into it
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # save the report to a file
    with open(out_path, "w") as f:
        f.write(report)
    return None


In [32]:
# load the data
filename = os.path.join("..", "in", "fake_or_real_news.csv")
data = pd.read_csv(filename, index_col=0)
# these column names will be specific to whatever dataset is loaded in
X = data["text"]
y = data["label"]

# The pipeline runs once: looping over `data` only iterates over its column
# names and would repeat the exact same steps for every column.
# splitting into train/test data
X_train, X_test, y_train, y_test = train_test(X, y)
# vectorizing
vectorizer, X_train_feats, X_test_feats = vectorize(X_train, X_test)
# logistic regression classifier - evaluated on the test features, not the training features
classifier, classifier_metrics = logistic_classifier(X_train_feats, y_train, X_test_feats, y_test)
# save the report held in classifier_metrics
save_metrics()
    


TypeError: logistic_classifier() takes 0 positional arguments but 4 were given

In [None]:
def load_data(filename=None, text_col="text", label_col="label"):
    """ Function to load in data 

    Args:
        filename: path to the csv file. Defaults to
            ``../in/fake_or_real_news.csv``.
        text_col: name of the column holding the text data.
        label_col: name of the column holding the labels.

    Returns:
        X: the text data 
        y: the labels of that data 
    """    
    # making variables global so they can be used in other functions - kept for
    # backwards compatibility, but using the returned values is the safer option
    global X 
    global y
    if filename is None:
        filename = os.path.join("..", "in", "fake_or_real_news.csv")
    # the first column of the csv is used as the index
    data = pd.read_csv(filename, index_col=0)
    # the column names are specific to whatever dataset is loaded in
    X = data[text_col]
    y = data[label_col]
    # returning both values as a tuple lets the caller unpack them: X, y = load_data()
    return X, y