In [1]:
import sys
import os
if os.getcwd().endswith('notebooks'):
    os.chdir("..")
print(os.getcwd())

C:\Users\BrielleJohnston\Documents\Python\personnal-projects


## Imports 

In [2]:
import re
import numpy as np
from typing import Dict, List, Optional
from IPython.display import clear_output
from IPython.display import HTML, display

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger') 
from nltk.stem import WordNetLemmatizer
clear_output()

## Functions and Class

In [3]:
def clean_text(contexts:list) -> str:
    """
        contexts: a list of strings
        
        return: cleaned list of strings
    """
    clean_text = []
    for context in contexts:
        text = context.lower() # lowercase text
        text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
        text = BAD_SYMBOLS_RE.sub('', text) # delete bad characters from text aka the one excluded from BAD_SYMBOLS_RE regex
        text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
        clean_text.append(text)
    return clean_text


# Class to get the prediction and most important words
class Results:
    """
    
    A Class to represent the results from a single prediction.
    
    Attributes
    ----------
    string : str
        the string of the context we want to predict.
    model : MultinomialNB()
        the Naive Bayes ml model we are using to make the prediction.
    n_important_words : int
        the number of important words we want to see.
    
    Functions
    -------
    get_prediction(self) -> str:
        Returns the prediction for the string.
    get_important_words(self) -> list:
        Returns the words that contributed the most to the prediction.
        
    """

    def __init__(self, string, model, n_important_words):
        self.string = string
        self.model = model
        self.n_important_words = n_important_words
        self.prediction = self.get_prediction()
        self.important_words = self.get_important_words()
  

    def __repr__(self):
        return f"The text is ====> \n{self.string} \n\nPrediction is ===> {self.prediction}"


    def get_prediction(self):
        self.string_list = []
        self.string_list.append(self.string)
        self.string_trnsfm = clean_text(self.string_list)
        self.string_trnsfm = vectorizer.transform(self.string_list)
        self.prediction = self.model.predict(self.string_trnsfm)[0]
        return self.prediction


    def get_important_words(self):
        map_word_to_rank = {}
        # Cleaning the input string
        string_list = [string]
        clean_string = clean_text(string_list)[0]
        clean_string = re.sub('\d', ' ', clean_string)
        words = clean_string.split()
        # Getting the most important words per label
        map_class_to_coef = dict(zip(self.model.classes_, self.model.feature_log_prob_))
        # sort the coefficients and use the index
        class_coeffs_sorted = map_class_to_coef[self.prediction].argsort()[::-1]
        # how important is each word in the classification
        map_order_to_words = dict(
            zip(np.take(vectorizer.get_feature_names_out(), class_coeffs_sorted), 
                range(len(np.take(vectorizer.get_feature_names_out(), class_coeffs_sorted)))
               )
        )
        for word in words: # rank the words in the string by order of importance
            map_word_to_rank[word] = map_order_to_words.get(word, 99999999)
        important_words = sorted(map_word_to_rank, key=map_word_to_rank.get, reverse=False)[:self.n_important_words] 
        return important_words


## Import the Data 

In [4]:
df = pd.read_csv(os.path.join('data', 'stackoverflow_qs_for_classification.csv')).sample(frac=1)
# Filter for a few types of posts
df = df[df['label'].isin(['python', 'iphone', 'sql', 'java', 'html'])].reset_index(drop=True)
print(df.shape)
df.head()

(10000, 2)


Unnamed: 0,label,post
0,iphone,expected identifier or ( i am working on a we...
1,java,make default value of a short[] element 0 in...
2,iphone,how do i move a block of code in a method for ...
3,python,sublime text 2 not able to build and run syste...
4,python,get intermediate value in python i m trying ...


### Create Train and test sets 

In [5]:
df_train = df.loc[:7500]
df_test = df.loc[7501:]

## Clean and Tokenize the training data

### Make lowercase, remove stopwords and special characters

In [6]:
contexts = df_train['post'].tolist()
stopwords = nltk.corpus.stopwords.words('english')
STOPWORDS = set(stopwords)
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
clean_contexts = clean_text(contexts=contexts)
clean_contexts[:3]

 'make default value short element 0 instead null program creates lists needs assigned values 0 running fine codeint humpty_dumpty new int 20 code optimize size lists set codeshort code program breaking takes zero inputs codeshort humpty_dumpty new short 20 code making default value codenull code way set default zero without iterate entire list via loop wondering way make behavior similar int',

### Tfidf Vectorizer 

In [7]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(clean_contexts)
print("n_samples: %d, n_features: %d" % vectors.shape)

n_samples: 7501, n_features: 70287


## Split into y_train and y_test

In [8]:
X_train = vectors
y_train = df_train['label']

In [9]:
X_test = vectorizer.transform(df_test['post'].to_list())
y_test = df_test['label']

# Model 

## Create Naive Bayes Classifier 

In [10]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

### View Accuracy 

In [11]:
predictions = naive_bayes.predict(X_test)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))

Accuracy score:  0.9059623849539816


### Take a random test context and see the prediction and important words 

In [21]:
testing_row = df_test.sample(n=1)
string = testing_row['post'].values[0]
results = Results(string, model=naive_bayes, n_important_words=5)
print('\nActual label is: ', testing_row['label'].values[0], '\n')
print(results)
print(f'\nWords that have the most importance in the prediction: {results.get_important_words()}')


Actual label is:  sql 

The text is ====> 
sql: set value with condition  i have some problem  there are two tables  they communicate with the value id. now i will set the value from column <strong>a</strong> in table <strong>a</strong> with the value  nein   but only if the value of the column <strong>b</strong> in table <strong>b</strong> is  0  and  if a.id = b.id.    how can i do that  thanks 

Prediction is ===> sql

Words that have the most importance in the prediction: ['table', 'sql', 'column', 'tables', 'id']
