# In [None]:  (Jupyter notebook cell marker left over from the export)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
#import winsound as beep
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.dummy import DummyClassifier

import nltk
nltk.download('punkt')

def plotRocCurve(data, flag=0):
  """Plot one one-vs-rest ROC curve per hard-coded subreddit category.

  For every category a binary target is built (1 for that category, 0 for
  all others), a classifier is fit on the training split and its ROC curve
  on the test split is shown with matplotlib.

  Args:
    data: DataFrame accepted by getXYandTest, i.e. feature columns plus a
      'category' column holding the subreddit label.
    flag: 1 fits a uniform-random DummyClassifier (baseline curve); any
      other value fits a LinearSVC.
  """
  category_list = ['rpg', 'rpg_gamers', 'anime', 'hardware', 'cars', 'gamernews',
                   'gamedev', 'computers']

  # getXYandTest uses a fixed random_state, so the split is the same on every
  # call; compute it once instead of once per category.
  X_train, X_test, y_train, y_test = getXYandTest(data)

  for cat in category_list:
    # Build fresh 0/1 targets rather than overwriting labels in place.
    # Integer targets also make the category itself the positive class for
    # every curve; with string labels the positive class would depend on
    # how the category name sorts against the placeholder label.
    y_train_bin = (y_train == cat).astype(int)
    y_test_bin = (y_test == cat).astype(int)

    # Random/Dummy
    if flag == 1:
      classifier = DummyClassifier(strategy='uniform')
      label = cat + " Random"

    # Regular LinearSVC()
    else:
      classifier = LinearSVC()
      label = cat

    classifier.fit(X_train, y_train_bin)
    print("\n\nCategory: ", label)

    # plot_roc_curve is missing from newer scikit-learn releases; prefer the
    # RocCurveDisplay API when it exists and fall back for old installs.
    if getattr(metrics.RocCurveDisplay, 'from_estimator', None) is not None:
      metrics.RocCurveDisplay.from_estimator(classifier, X_test, y_test_bin)
    else:
      metrics.plot_roc_curve(classifier, X_test, y_test_bin)
    plt.show()

def cleanText(text):
  """Return the Porter-stemmed tokens of a post as one space-prefixed string.

  The text is parsed with BeautifulSoup (lxml) to keep only its text
  content, digit characters are removed, underscores become spaces, and the
  remainder is tokenized and stemmed.

  Args:
    text: raw post body.

  Returns:
    A string in which every stem is preceded by one space (so a non-empty
    result starts with " "); the empty string when there are no tokens.
  """
  plain = BeautifulSoup(text, "lxml").get_text()

  # Get rid of numbers, then treat underscores as word separators
  withoutDigits = "".join(ch for ch in plain if not ch.isdigit())
  spaced = withoutDigits.replace("_", " ")

  # Turn words into their stem words
  stemmer = PorterStemmer()
  stems = [stemmer.stem(token) for token in word_tokenize(spaced)]

  return "".join(" " + stem for stem in stems)

def getXY(data):
  """Split a DataFrame into its feature columns and its 'category' column.

  Args:
    data: DataFrame containing a column named 'category'.

  Returns:
    (x, y): x holds every column whose name is not 'category'; y is the
    'category' column.
  """
  is_feature = data.columns != 'category'
  features = data.loc[:, is_feature]
  labels = data['category']
  return features, labels

def getXYandTest(data):
    """Return a seeded 80/20 train/test split of the features and labels.

    Args:
        data: DataFrame accepted by getXY.

    Returns:
        (X_train, X_test, y_train, y_test) from train_test_split with
        test_size=0.20 and random_state=1, so repeated calls on the same
        data give the same split.
    """
    features, labels = getXY(data)
    parts = train_test_split(features, labels, random_state=1, test_size=0.20)
    return tuple(parts)

def decisionTree(data):
  """Fit a DecisionTreeClassifier on the train split and print its test accuracy.

  Args:
    data: DataFrame accepted by getXYandTest.
  """
  train_x, test_x, train_y, test_y = getXYandTest(data)

  tree = DecisionTreeClassifier()
  tree.fit(train_x, train_y)

  accuracy = metrics.accuracy_score(test_y, tree.predict(test_x))
  print("\n\nACCURACY DECISION TREE: ", accuracy)

def linearSVC(data, test):
  """Evaluate a LinearSVC on a held-out split, then predict the submission set.

  Two models are trained:
    * a hold-out model fit on the training split only; it produces the
      reported accuracy and the confusion matrix, so those figures are
      computed on rows the model has not seen;
    * a final model fit on every row of `data`; it produces the predictions
      for `test`.

  Args:
    data: DataFrame accepted by getXY / getXYandTest (features plus a
      'category' column).
    test: feature matrix to predict; it must have the same feature columns
      as `data` without 'category'.

  Returns:
    DataFrame with a single 'subreddit' column holding the predicted label
    for each row of `test`, in order.
  """
  X_train, X_test, y_train, y_test = getXYandTest(data)
  holdoutModel = LinearSVC()
  holdoutModel.fit(X_train, y_train)
  y_predicted = holdoutModel.predict(X_test)
  print("\n\nACCURACY Linear SVC Reddit-train: ", metrics.accuracy_score(y_test, y_predicted))

  x_all, y_all = getXY(data)
  finalModel = LinearSVC()
  finalModel.fit(x_all, y_all)

  # finalModel was fit on X_test as well, so this figure is in-sample and
  # optimistic; it is labelled as such instead of being presented as a test
  # score.
  y_in_sample = finalModel.predict(X_test)
  print("\n\nACCURACY Linear SVC full-data fit (in-sample, optimistic): ",
        metrics.accuracy_score(y_test, y_in_sample))

  # The confusion matrix comes from the hold-out model so that it reflects
  # errors on unseen rows rather than on rows the model was trained on.
  print("\nConfusion Matrix of Linear SVC\n", confusion_matrix(y_test, y_predicted))

  # Predict reddit-test-data.csv for kaggle submission
  y_predicted_test = finalModel.predict(test)
  print("Y PREDICTED LINEAR SVC\n", y_predicted_test)
  test_submission_df = pd.DataFrame(y_predicted_test, columns=['subreddit'])
  return test_submission_df

class NaiveBayes:
  """One-vs-rest Bernoulli Naive Bayes over a DataFrame of 0/1 features.

  Usage: call setFrequencies(data), then setTheta(), setThetaOne() and
  setThetaTwo() (in that order after setFrequencies), then predict(row).

  Attributes set by the setters:
    catCounts: per-category document count plus 2 (smoothed denominator).
    wordCounts: per-feature sum over all documents.
    total: number of documents.
    uCat: per-category, per-feature sum plus 1 (smoothed numerator).
    theta: per-category prior P(category).
    thetaOne: P(feature = 1 | category), Laplace-smoothed.
    thetaTwo: P(feature = 1 | any other category), Laplace-smoothed.

  NOTE(review): the formulas treat each feature as present/absent; they
  assume feature values are 0 or 1 -- confirm for any new caller.
  """

  def setFrequencies(self, data):
    """Collect the counts the other setters need.

    Args:
      data: DataFrame with feature columns and a 'category' column.
    """
    unique_categories = data.groupby('category').sum()
    category_counts = data['category'].value_counts()

    self.catCounts = category_counts + 2
    self.wordCounts = unique_categories.sum()
    self.total = category_counts.sum()
    self.uCat = unique_categories + 1

  def setTheta(self):
    """Set the class prior P(category) = documents in category / documents."""
    # catCounts carries the +2 smoothing term; strip it so the priors are
    # real proportions (they sum to 1 and stay below 1 on small data).
    self.theta = (self.catCounts - 2).div(self.total)

  def setThetaOne(self):
    """Set P(feature = 1 | category) = (hits + 1) / (documents + 2)."""
    self.thetaOne = self.uCat.div(self.catCounts, axis=0)

  def setThetaTwo(self):
    """Set P(feature = 1 | other categories) = (hits + 1) / (documents + 2).

    Hits and documents are counted over every category except the row's own.
    """
    # Undo the smoothing stored in uCat / catCounts to recover raw counts,
    # take the complement, and only then apply the +1 / +2 smoothing. This
    # keeps every probability strictly between 0 and 1, so the logs taken in
    # predict() are finite.
    ownHits = self.uCat - 1
    otherHits = ownHits.sub(self.wordCounts, axis=1) * (-1)
    otherDocs = self.total - (self.catCounts - 2)
    self.thetaTwo = (otherHits + 1).div(otherDocs + 2, axis=0)

  def predict(self, data):
    """Return the category with the highest one-vs-rest log-odds.

    Args:
      data: Series of feature values for one document, indexed by the same
        feature labels used as columns in the training data.

    Returns:
      The category label (index value) whose score is largest.
    """
    # Prior log-odds of each category against the rest
    priorLogOdds = np.log(self.theta.div(1 - self.theta))

    # Contribution of features that are present in the document
    presentTerm = np.log(self.thetaOne.div(self.thetaTwo)).multiply(data)

    # Contribution of features that are absent: weighted by (1 - x), so a
    # feature contributes through exactly one of the two terms.
    absentTerm = np.log((1 - self.thetaOne).div(1 - self.thetaTwo)).multiply(1 - data)

    likelihoodLogOdds = (presentTerm + absentTerm).sum(axis=1)

    alpha = priorLogOdds + likelihoodLogOdds
    return alpha.idxmax()

# ================================= Main =================================
if __name__ == '__main__':
  # Script entry point: read the Reddit train/test CSVs, clean the post
  # bodies, encode them with TF-IDF and with a binary CountVectorizer, run
  # the LinearSVC evaluation on both encodings (writing the TF-IDF
  # predictions to test_submission_df.csv), and plot the ROC curves.

  redditTrainDf = pd.read_csv('/content/sample_data/reddit-train.csv', delimiter=',')
  redditTestDf = pd.read_csv('/content/sample_data/reddit-test-data.csv', delimiter=',')

  # Clean every post exactly once, into new arrays. Writing cleaned text
  # back through Series.to_numpy() either mutates the DataFrame (so a later
  # pass re-cleans already cleaned text) or fails when the returned array
  # is read-only.
  cleanedTrainPosts = np.array([cleanText(post) for post in redditTrainDf['body']], dtype=object)
  cleanedTestPosts = np.array([cleanText(post) for post in redditTestDf['body']], dtype=object)

  # -----------------------------tfIDF-----------------------------
  tfIDFwordNumpyArray = cleanedTrainPosts
  print(tfIDFwordNumpyArray)
  # ------------------------

  # Create and fit the tfIDFvectorizer and create the tfIDFencoded_df
  tfIDFvectorizer = TfidfVectorizer(max_features=15000, stop_words='english')
  tfIDFvectorizer.fit(tfIDFwordNumpyArray)
  tfIDFvector = tfIDFvectorizer.transform(tfIDFwordNumpyArray)
  tfIDFencodedVector = tfIDFvector.toarray()
  tfIDFcategories = redditTrainDf['subreddit'].to_numpy()
  tfIDFencoded_df = pd.DataFrame(tfIDFencodedVector)
  tfIDFencoded_df['category'] = tfIDFcategories
  # ------------------------

  # Get the order of the words in the tfIDFvectorizer: vocabulary_ maps each
  # word to its column index, so sorting the words by that index lists them
  # in column order.
  wordOrder = sorted(tfIDFvectorizer.vocabulary_, key=tfIDFvectorizer.vocabulary_.get)
  print("\n\ntfIDF word order: ", wordOrder)
  # ------------------------

  print("-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-\n\n\n")
  # tfIDF Naive Bayes ------------------------
  #tfIDFnb = NaiveBayes()
  #tfIDFnb.setFrequencies(tfIDFencoded_df)
  #tfIDFnb.setTheta()
  #tfIDFnb.setThetaOne()
  #tfIDFnb.setThetaTwo()

  # Encode the data in reddit-test-data.csv with TfIDF Vectorizer
  tfIDFwordNumpyArrayTest = cleanedTestPosts
  print(tfIDFwordNumpyArrayTest)
  # ------------------------

  tfIDFvectorTest = tfIDFvectorizer.transform(tfIDFwordNumpyArrayTest)
  tfIDFencodedVectorTest = tfIDFvectorTest.toarray()
  tfIDFencoded_df_reddit_test = pd.DataFrame(tfIDFencodedVectorTest)

  # linear SVC test on reddit-train.csv and reddit-test-data.csv
  test_submission_df = linearSVC(tfIDFencoded_df, tfIDFencoded_df_reddit_test)
  test_submission_df.to_csv('test_submission_df.csv', sep=',', index=True)
  print("-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-tfIDF-\n\n\n")
  # -----------------------------tfIDF end-----------------------------


  # ----------------------------Count Vectorizer----------------------------
  # Reuse the text cleaned above instead of cleaning it a second time
  wordNumpyArray = cleanedTrainPosts
  print(wordNumpyArray)
  # ------------------------

  # Create and fit the CountVectorizer and create the encoded_df
  vectorizer = CountVectorizer(max_features=15000, stop_words='english', binary=True)
  vectorizer.fit(wordNumpyArray)
  vector = vectorizer.transform(wordNumpyArray)
  encodedVector = vector.toarray()
  categories = redditTrainDf['subreddit'].to_numpy()
  encoded_df = pd.DataFrame(encodedVector)
  encoded_df['category'] = categories
  # ------------------------

  # Get the order of the words in the CountVectorizer (words sorted by
  # their column index)
  wordOrder = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
  print("\n\nCountVectorizer word order: ", wordOrder)
  # ------------------------

  print("-Count Vectorizer -Count Vectorizer -Count Vectorizer -Count Vectorizer -Count Vectorizer\n\n\n")
  # CountVectorizer Naive Bayes ------------------------
  #nb = NaiveBayes()
  #nb.setFrequencies(encoded_df)
  #nb.setTheta()
  #nb.setThetaOne()
  #nb.setThetaTwo()

  # Encode the data in reddit-test-data.csv with Count Vectorizer
  wordNumpyArrayTest = cleanedTestPosts
  print(wordNumpyArrayTest)
  # ------------------------

  vectorTest = vectorizer.transform(wordNumpyArrayTest)
  encodedVectorTest = vectorTest.toarray()
  encoded_df_reddit_test = pd.DataFrame(encodedVectorTest)

  # linear SVC test on reddit-train.csv and reddit-test-data.csv
  linearSVC(encoded_df, encoded_df_reddit_test)
  print("-Count Vectorizer -Count Vectorizer -Count Vectorizer -Count Vectorizer -Count Vectorizer\n\n\n")
  # ----------------------------Count Vectorizer End----------------------------


  # ------------------------------- ROC curve -------------------------------
  plotRocCurve(tfIDFencoded_df)
  plotRocCurve(tfIDFencoded_df, 1)


  # The two string literals below are disabled experiments kept for
  # reference; as bare strings they do nothing when the script runs. The
  # first needs the commented-out `nb` model above; the second writes into
  # `resultsDF`, which only the first snippet creates.
  """
  results = []
  for i in range(len(encoded_df_reddit_test)):
    results.append(nb.predict(encoded_df_reddit_test.loc[i]))
  
  resultsDF = pd.DataFrame({'subreddit' : results})
  print(results, "\n")
  print(resultsDF)
  resultsDF.to_csv('/content/sample_data/results.csv', sep=',')"""


  """
  USED FOR ITERATIVE TESTING (uses 10-folding)
  k = 10
  start = 90000
  step = 2500
  numSamps = 20
  end = start + (numSamps * step)

  top = 0
  topA = 0

  print("3 fold with shuffle")

  for maxVal in range(start, end, step):
    print("\nIteration:", maxVal)
    kFolds = KFold(n_splits=k, random_state=None)
    classifier = LinearSVC()
    tfIDFvectorizer = TfidfVectorizer(max_features=maxVal, stop_words='english', ngram_range=(1, 3))

    tfIDFvectorizer.fit(tfIDFwordNumpyArray)
    tfIDFvector = tfIDFvectorizer.transform(tfIDFwordNumpyArray)
    tfIDFencodedVector = tfIDFvector.toarray()
    tfIDFencoded_df = pd.DataFrame(tfIDFencodedVector)
    tfIDFencoded_df['category'] = tfIDFcategories

    dataX, dataY = getXY(tfIDFencoded_df)

    crossValResults = cross_val_score(classifier, dataX, dataY, cv = kFolds)
    accuracy = crossValResults.mean()
    if(accuracy>topA):
        topA = accuracy
        top = maxVal

    resultsDF.loc[maxVal-start] = [maxVal] + [accuracy]

  print(resultsDF)
  print("Top: ", top, " @ ", topA)
  #beep.Beep(1000, 440)
  """