In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

In [3]:
# Load the train and test splits from their CSV files
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
)

In [4]:
# Data preprocessing

# Prepend each article's title to its body text, then drop the columns
# (subject, date and the now-redundant title) that the model does not use
for frame in (train, test):
    frame["text"] = frame["title"] + " " + frame["text"]

train = train.drop(columns=["subject", "date", "title"])
test = test.drop(columns=["subject", "date", "title"])

def clean_text_data(data_point):
    """Normalize one raw article into a space-separated string of lowercase words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lowercase, split on whitespace and drop English stop words.

    Args:
        data_point: Raw text (possibly containing HTML). ``None`` and NaN,
            which pandas uses for missing values, are treated as empty text.

    Returns:
        The cleaned text as a single string ("" when nothing is left).
    """
    # Missing values are not strings; BeautifulSoup would raise on them
    if data_point is None or (isinstance(data_point, float) and data_point != data_point):
        return ""

    # Load the stop words once and keep them as a set on the function object:
    # membership tests become O(1) and the NLTK corpus is not re-read per row
    stop_words = getattr(clean_text_data, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        clean_text_data._stop_words = stop_words

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no parser warning is emitted)
    review_text = BeautifulSoup(data_point, "html.parser").get_text()
    review_letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    review_words = review_letters_only.lower().split()

    return " ".join(word for word in review_words if word not in stop_words)

# Clean the combined title + body text of both splits
for frame in (train, test):
    frame["text"] = frame["text"].apply(clean_text_data)



In [5]:
# View cleaned train dataset (bare last expression, so the notebook renders the DataFrame)
train

Unnamed: 0,text,label
0,clinton faces pressure pick vp tough trade wal...,real
1,ryan trump cite positive step toward republica...,real
2,watch president obama dares republicans suppor...,fake
3,hariri warns lebanon faces arab sanctions risk...,real
4,poem twas night cnn christmas acr boiler room ...,fake
...,...,...
39893,lol photo accompanying google search pathologi...,fake
39894,trump wants children would break law donald tr...,fake
39895,gay activists march serb capital behind police...,real
39896,boiler room smoking gunz tune alternate curren...,fake


In [6]:
# View cleaned test dataset (bare last expression, so the notebook renders the DataFrame)
test

Unnamed: 0,text,label
0,factbox taxes budget u congress calendar tight...,real
1,breaking israel worst fears confirmed says isr...,fake
2,u drug enforcement chief step agency reuters u...,real
3,factbox trump twitter oct rex tillerson puerto...,real
4,fcc chief plans ditch u net neutrality rules w...,real
...,...,...
4995,republicans told stop talking healthcare repea...,fake
4996,texas bill restricting insurance coverage abor...,real
4997,montana dems hilariously troll reporter slammi...,fake
4998,trump says gave classified info russia humanit...,fake


In [7]:
# Split data into Pandas Series for X and Y.
# Plain column selection always yields a Series (even for a one-row frame)
# and raises KeyError if the column is missing.
X_train = train["text"]
X_test = test["text"]
Y_train = train["label"]
Y_test = test["label"]

In [8]:
# Convert the text into bag-of-words features
# NOTE: X_train / X_test are rebound here from the text Series to the vectorizer's output
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)  # learn the vocabulary from the training text only
X_test = vectorizer.transform(X_test)  # encode the test text with that same vocabulary

In [18]:
class DecisionTree:
    """Binary decision-tree classifier grown greedily on entropy-based information gain.

    Internal nodes are ``Node`` objects and terminal nodes are ``Leaf`` objects.
    A sample goes to the left subtree when ``feature <= split_val`` and to the
    right subtree otherwise; training, prediction and printing all use this
    same rule.

    The feature matrix may be a pandas DataFrame, a SciPy sparse matrix or a
    2-D array-like. Labels may be any 1-D sequence (list, array, Series); rows
    are matched to labels by position, not by pandas index.
    """

    # Constructor, initialize max depth
    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
                ``None`` (default) grows until no split has positive gain.
        """
        self.max_depth = max_depth

    # Fit tree, calls _build tree helper function
    def fit(self, X, Y):
        """Build the tree from feature matrix ``X`` and labels ``Y``.

        Raises:
            ValueError: If ``X`` has no rows or ``X`` and ``Y`` differ in length.
        """
        X = self._as_feature_matrix(X)
        labels = np.asarray(Y)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if X.shape[0] != len(labels):
            raise ValueError("X and Y must contain the same number of samples")
        self.tree = self._build_tree(X, labels)

    # Make predictions on input data using the trained decision tree model
    def predict(self, X):
        """Return a list with the predicted class of every row of ``X``."""
        tree = self.tree
        if scipy.sparse.issparse(X):
            X = X.tocsr()
            # Densify one row at a time so the whole matrix never has to fit in memory
            return [
                self._traverse_tree(X[idx:idx + 1].toarray().ravel(), tree)
                for idx in range(X.shape[0])
            ]
        dense = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
        return [self._traverse_tree(row, tree) for row in dense]

    @staticmethod
    def _as_feature_matrix(X):
        """Normalize X to CSR (sparse), leave DataFrames alone, else make an ndarray."""
        if scipy.sparse.issparse(X):
            return X.tocsr()
        if hasattr(X, "iloc"):
            return X
        return np.asarray(X)

    @staticmethod
    def _column(X, feature_idx):
        """Return feature column ``feature_idx`` of X as a dense 1-D array."""
        if scipy.sparse.issparse(X):
            return X[:, feature_idx:feature_idx + 1].toarray().ravel()
        if hasattr(X, "iloc"):
            return X.iloc[:, feature_idx].to_numpy()
        return X[:, feature_idx]

    @staticmethod
    def _take_rows(X, mask):
        """Return the rows of X selected by the boolean array ``mask``."""
        if hasattr(X, "iloc"):
            return X.iloc[mask]
        return X[mask, :]

    # Build tree by recursively splitting input data on best feature given information gain criterion
    def _build_tree(self, X, Y, depth=0):
        """Recursively grow the subtree for samples ``X`` / labels ``Y``.

        Returns a ``Leaf`` when the depth limit is reached, when at most one
        class is present, or when no split has positive information gain;
        otherwise returns a ``Node`` for the best split.
        """
        X = self._as_feature_matrix(X)
        Y = np.asarray(Y)
        num_samples, num_features = X.shape

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(set(Y.tolist())) <= 1:
            return Leaf(Y)

        # Column access is cheap on CSC, so convert once per node for sparse input
        column_source = X.tocsc() if scipy.sparse.issparse(X) else X

        # Only splits with strictly positive gain are accepted; if none exists
        # (e.g. all features constant) best_split_feature stays None.
        best_gain = 0.0
        best_split_feature = None
        best_split_val = None
        best_left_mask = None

        for feature_idx in range(num_features):
            feature_vals = self._column(column_source, feature_idx)

            for split_val in np.unique(feature_vals):
                left_mask = feature_vals <= split_val
                num_left = np.count_nonzero(left_mask)

                # If either the left or right split is empty, skip this split
                if num_left == 0 or num_left == num_samples:
                    continue

                # Only the labels are needed to score a candidate; the feature
                # rows are sliced once, for the winning split, after the search.
                gain = self._information_gain(Y, Y[left_mask], Y[~left_mask])

                if gain > best_gain:
                    best_gain = gain
                    best_split_feature = feature_idx
                    best_split_val = split_val
                    best_left_mask = left_mask

        # No split improves purity: stop here instead of recursing on nothing
        if best_split_feature is None:
            return Leaf(Y)

        # The right branch is the exact complement of the left branch
        best_right_mask = ~best_left_mask
        left_tree = self._build_tree(
            self._take_rows(X, best_left_mask), Y[best_left_mask], depth + 1
        )
        right_tree = self._build_tree(
            self._take_rows(X, best_right_mask), Y[best_right_mask], depth + 1
        )
        return Node(best_split_feature, best_split_val, left_tree, right_tree)

    # Traverse the decision tree to predict the class of a test sample
    def _traverse_tree(self, x, node):
        """Follow one sample ``x`` from ``node`` down to a leaf and return its class."""
        # Index positionally even if x is a pandas Series with string labels
        if hasattr(x, "to_numpy"):
            x = x.to_numpy()

        while not isinstance(node, Leaf):
            # Same "<=" rule that _build_tree used to form the left branch
            if x[node.split_feature] <= node.split_val:
                node = node.left_tree
            else:
                node = node.right_tree
        return node.predicted_class

    def _information_gain(self, Y, left_Y, right_Y):
        """Return entropy(Y) minus the size-weighted entropy of the two children."""
        num_samples = len(Y)
        if num_samples == 0:
            return 0.0

        entropy_before_split = self._entropy(Y)
        entropy_after_split = ((len(left_Y) / num_samples) * self._entropy(left_Y)
                               + (len(right_Y) / num_samples) * self._entropy(right_Y))
        return entropy_before_split - entropy_after_split

    def _entropy(self, Y):
        """Return the Shannon entropy (base 2) of the labels in ``Y``; 0 if empty."""
        num_samples = len(Y)
        if num_samples == 0:
            return 0

        _, counts = np.unique(Y, return_counts=True)
        class_probs = counts / num_samples
        return -np.sum(class_probs * np.log2(class_probs))

    def print_tree(self, tree=None, indent=" ", _level=1):
        """Print the tree (or subtree ``tree``) as nested "X_i <= v ?" tests.

        Args:
            tree: Subtree to print; defaults to the fitted root.
            indent: String repeated once per depth level before each branch label.
            _level: Current depth, used internally by the recursion.
        """
        if tree is None:
            tree = self.tree

        if isinstance(tree, Leaf):
            print(tree.predicted_class)
            return

        # Indentation grows linearly with depth so deep trees remain printable
        pad = indent * _level
        print("X_" + str(tree.split_feature), "<=", tree.split_val, "?")
        print("%sleft:" % pad, end="")
        self.print_tree(tree.left_tree, indent, _level + 1)
        print("%sright:" % pad, end="")
        self.print_tree(tree.right_tree, indent, _level + 1)


# Node class definition
class Node:
    """Internal tree node: a feature/threshold test together with its two subtrees."""

    def __init__(self, split_feature, split_val, left_tree, right_tree):
        # Index of the tested feature and the value it is compared against
        self.split_feature, self.split_val = split_feature, split_val
        # Subtrees for the two outcomes of the test
        self.left_tree, self.right_tree = left_tree, right_tree

# Leaf class definition
class Leaf:
    """Terminal tree node that predicts the most frequent label among its samples."""

    def __init__(self, y):
        label_counts = Counter(y)
        # most_common(1) is a one-element list holding the (label, count) pair
        top_entries = label_counts.most_common(1)
        self.predicted_class = top_entries[0][0]

In [987]:
# NOTE ABOUT SCORES:
# The model was working correctly before, but I made some changes before the last run and before pushing to git,
# and one small change must have broken it. Unfortunately, the model took so long to run that I ran out of
# time and did not see that errors had occurred while waiting on the accuracy, recall, and F1 scores.
# I will attempt to resolve this ahead of the presentation and resubmit if allowed.

In [None]:
# TESTS ON IRIS DATASET

In [19]:
# Column names to assign, since the Iris CSV is read without a header row (header=None)
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
# The data is fetched over the network from this URL each time the cell runs
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
data = pd.read_csv(csv_url, header = None, names=col_names)

In [21]:
# Features are every column except the last; the last column ('type') is the label
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train_iris, X_test_iris, Y_train_iris, Y_test_iris = train_test_split(X, Y, test_size=.2, random_state=41)

In [22]:
# Fit a tree with no depth limit (max_depth defaults to None) on the Iris training split
dt = DecisionTree()
dt.fit(X_train_iris, Y_train_iris)

In [23]:
# Print the splits learned on the Iris data (X_i is the feature's column position)
dt.print_tree()

X_2 <= 1.9 ?
 left:Iris-setosa
 right:X_3 <= 1.5 ?
  left:X_2 <= 4.9 ?
    left:Iris-versicolor
    right:Iris-virginica
  right:X_2 <= 5.0 ?
    left:X_1 <= 2.8 ?
        left:Iris-virginica
        right:X_0 <= 5.9 ?
                left:Iris-versicolor
                right:X_0 <= 6.0 ?
                                left:Iris-virginica
                                right:Iris-versicolor
    right:Iris-virginica


In [24]:
Y_pred = dt.predict(X_test)

In [None]:
# Split data into Pandas Series for X and Y, reduce dataset for model testing
X_train, X_val, Y_train, Y_val = train_test_split(train["text"], train["label"], test_size=0.2, random_state=42)

In [25]:
accuracy_score(Y_test, Y_pred)

0.9333333333333333

In [None]:
# NOTE ABOUT SCORES:
# The model was working correctly before, but I made some changes before the last run and before pushing to git,
# and one small change must have broken it. Unfortunately, the model took so long to run that I ran out of
# time and did not see that errors had occurred while waiting on the accuracy, recall, and F1 scores.
# I will attempt to resolve this ahead of the presentation and resubmit if allowed.

In [28]:
# # Split data into Pandas Series for X and Y, reduce dataset for model testing
# X_train_t, X_val_t, Y_train_t, Y_val_t = train_test_split(train["text"], train["label"], test_size=0.2, random_state=42)

In [36]:
# Reduce the training data to a small random subset so the pure-Python tree trains quickly.
# The number of rows to drop is derived from the frame's length (so it also works if the
# dataset size changes) and the sample is seeded so the subset is reproducible.
# NOTE(review): 100 assumes the earlier hard-coded "drop 39798 rows" left about that many - confirm.
REDUCED_SAMPLE_SIZE = 100
rows_to_drop = train.sample(max(len(train) - REDUCED_SAMPLE_SIZE, 0), random_state=42).index
reduced_data = train.drop(rows_to_drop)

# Split the reduced data into 80% training / 20% validation text and labels
X_train_t, X_val_t, Y_train_t, Y_val_t = train_test_split(reduced_data["text"], reduced_data["label"], test_size=0.2, random_state=42)

In [40]:
# Convert the text into bag-of-words features
# NOTE: this rebinds `vectorizer` to a new CountVectorizer fitted on the reduced training text
vectorizer = CountVectorizer()
X_train_reduced_t = vectorizer.fit_transform(X_train_t)  # learn the vocabulary from the training fold
X_val_reduced_t = vectorizer.transform(X_val_t)  # encode the validation fold with that vocabulary

In [55]:
# Sanity check: largest single-word count in the first training document
first_row_counts = X_train_reduced_t.getrow(0).toarray().ravel()
print(first_row_counts.max())

5


In [30]:
# Split data into Pandas Series for X and Y.
# X_train / Y_train are the 80% training fold, not the full training set: the validation
# rows (X_val / Y_val) come from the same frame, so fitting on all of `train` would
# score the model on rows it was trained on.
X_train, X_val, Y_train, Y_val = train_test_split(
    train["text"], train["label"], test_size=0.2, random_state=42
)
X_test = test["text"]
Y_test = test["label"]

In [27]:
# Convert the text into bag-of-words features
# NOTE: this refits the existing `vectorizer`, so its vocabulary now comes from X_train
X_train_reduced = vectorizer.fit_transform(X_train)
X_val_reduced = vectorizer.transform(X_val)  # encode the validation text with that vocabulary

In [None]:
# Create a single decision tree limited to a maximum depth of 50 and fit it on the
# vectorized training text (X_train_reduced) with the Y_train labels
dt_full = DecisionTree(max_depth = 50)
dt_full.fit(X_train_reduced, Y_train)

In [None]:
# Predict labels for the validation fold with the news model (dt_full);
# `dt` is the tree that was fitted on the Iris data
Y_pred = dt_full.predict(X_val_reduced)

In [None]:
# Accuracy score of the predictions in Y_pred against the validation labels Y_val
accuracy_score(Y_val, Y_pred)