<a href="https://colab.research.google.com/github/ces0157/AI_class_project/blob/Sarah/logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
# all necessary import statements
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bingh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load Dataset

In [8]:
# Load the Yelp full-review dataset, then convert its train and test splits to pandas DataFrames
dataset = load_dataset("yelp_review_full")
dp_train, dp_test = (dataset[split].to_pandas() for split in ("train", "test"))

In [9]:
# Display the training split (columns: label, text) as this cell's output
dp_train

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...
...,...,...
649995,4,I had a sprinkler that was gushing... pipe bro...
649996,0,Phone calls always go to voicemail and message...
649997,0,Looks like all of the good reviews have gone t...
649998,4,I was able to once again rely on Yelp to provi...


# Functions

In [39]:
def create_vectorizer(train_df: pd.DataFrame, column_text: str):
    """Fit a CountVectorizer on one text column of the training data.

    Args:
        train_df: DataFrame holding the training examples.
        column_text: Name of the column in ``train_df`` whose text defines
            the vocabulary.

    Returns:
        The fitted CountVectorizer. The vocabulary size is printed as a
        side effect.
    """
    # NOTE(review): stop words could be handled here through CountVectorizer's
    # `stop_words` argument instead of pre-filtering the text (open question
    # carried over from the original author).
    vectorizer = CountVectorizer()

    # Fit the vocabulary straight from the column values; the frame itself is
    # only read, so no copy of it is needed.
    vectorizer.fit(train_df[column_text].values)
    print(f'Vocab length: {len(vectorizer.vocabulary_)}')

    return vectorizer

In [40]:
def create_bag_words(df: pd.DataFrame, vectorizer, column_text: str):
    """Encode one text column as bag-of-words features with a fitted vectorizer.

    Args:
        df: DataFrame containing the text to encode. It is only read.
        vectorizer: An already-fitted vectorizer (e.g. the one returned by
            ``create_vectorizer``); only its ``transform`` method is called,
            so the vocabulary learned from the training data is reused.
        column_text: Name of the column in ``df`` holding the text.

    Returns:
        Whatever ``vectorizer.transform`` returns for the column values
        (a sparse matrix for the CountVectorizer used in this notebook).
    """
    # Convert the column to a numpy array of strings for the vectorizer.
    sentences = df[column_text].values

    # Create the bag-of-words features using the previously fitted vocabulary.
    return vectorizer.transform(sentences)

# Train Data

In [41]:
# Target vector: the label of every training review, as a numpy array
y_train = dp_train.loc[:, "label"].values

In [42]:
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Lowercase BEFORE filtering: the membership test is case-sensitive and NLTK's
# stop words are lowercase, so filtering first would leave capitalised stop
# words (e.g. "The", "And") in the text.
# Any other split scored by the model must be preprocessed in this same order.
dp_train['no_stops'] = dp_train['text'].str.lower().apply(
    lambda review: ' '.join(word for word in review.split() if word not in stop_words)
)

In [43]:
# Learn the vocabulary from the stop-word-filtered training text
vectorizer = create_vectorizer(train_df=dp_train, column_text="no_stops")

# Encode the training reviews as bag-of-words counts over that vocabulary
X_fetaures_train = create_bag_words(df=dp_train, vectorizer=vectorizer, column_text="no_stops")

Vocab length: 242886


In [None]:
# Print the dimensions of the training bag-of-words feature matrix
print(X_fetaures_train.shape)

In [None]:
# Fit a logistic-regression classifier on the training bag-of-words features,
# allowing the solver up to 10000 iterations
lgr = LogisticRegression(max_iter=10_000).fit(X_fetaures_train, y_train)
lgr

In [None]:
# Predict labels for the training data and print per-class metrics.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
predict_train = lgr.predict(X_fetaures_train)
print(classification_report(y_train, predict_train))

In [None]:
# Plot the confusion matrix of true training labels against the model's predictions
cm = confusion_matrix(y_true=y_train, y_pred=predict_train)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

# Test Data

In [None]:
# Target vector: the label of every test review, as a numpy array
y_test = dp_test.loc[:, "label"].values

In [None]:
# Lowercase the test reviews, then drop stop words.
# Lowercasing must come first: the membership test is case-sensitive and the
# stop-word set is lowercase, so filtering first would leave capitalised stop
# words (e.g. "The", "And") in the text.
# The training split must be preprocessed in this same order.
dp_test['no_stops'] = dp_test['text'].str.lower().apply(
    lambda review: ' '.join(word for word in review.split() if word not in stop_words)
)

In [None]:
# Encode the test reviews with the vectorizer fitted on the training data,
# so both splits share one vocabulary (no re-fitting on the test split)
X_features_test = create_bag_words(df=dp_test, vectorizer=vectorizer, column_text="no_stops")

In [None]:
# Predict labels for the test data and print per-class metrics.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
predict_test = lgr.predict(X_features_test)
print(classification_report(y_test, predict_test))

In [None]:
# Plot the confusion matrix of true test labels against the model's predictions
cm = confusion_matrix(y_true=y_test, y_pred=predict_test)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()