In [1]:
#          /scratch/eecs448w24_class_root/eecs448w24_class/shared_data/valr

# Libraries and Data Read-in

In [2]:
# Common libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Linear regression Question libraries
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge

# Logistic Regression Question Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import seaborn as sns

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /home/atorell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/atorell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/atorell/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
file_path = '/scratch/eecs448w24_class_root/eecs448w24_class/shared_data/valr/Combined_Goodreads_Data.csv'
df = pd.read_csv(file_path)

In [6]:
print(df.head(10))
print(df["publication_date"])

   Unnamed: 0  book_id                                              title  \
0           1        1  Harry Potter and the Half-Blood Prince (Harry ...   
1           2        1  Harry Potter and the Half-Blood Prince (Harry ...   
2           3        1  Harry Potter and the Half-Blood Prince (Harry ...   
3           4        1  Harry Potter and the Half-Blood Prince (Harry ...   
4           5        1  Harry Potter and the Half-Blood Prince (Harry ...   
5           6        1  Harry Potter and the Half-Blood Prince (Harry ...   
6           7        1  Harry Potter and the Half-Blood Prince (Harry ...   
7           8        1  Harry Potter and the Half-Blood Prince (Harry ...   
8           9        1  Harry Potter and the Half-Blood Prince (Harry ...   
9          10        1  Harry Potter and the Half-Blood Prince (Harry ...   

                      authors  average_rating        isbn         isbn13  \
0  J.K. Rowling/Mary GrandPré            4.57  0439785960  9780439785969   


# Exploratory Data Analysis

In [4]:
# there are 999 rows and 11 features
df.shape

(54951, 23)

In [5]:
# 4 features are numerical and the rest are strings 
# 2 features have null rows 
# we can also see the column names 
df.info()

# because we know we have some missing values:
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54951 entries, 0 to 54950
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          54951 non-null  int64  
 1   book_id             54951 non-null  int64  
 2   title               54951 non-null  object 
 3   authors             54951 non-null  object 
 4   average_rating      54951 non-null  float64
 5   isbn                54951 non-null  object 
 6   isbn13              54951 non-null  int64  
 7   language_code       54951 non-null  object 
 8   num_pages           54951 non-null  int64  
 9   ratings_count       54951 non-null  int64  
 10  text_reviews_count  54951 non-null  int64  
 11  publication_date    54951 non-null  object 
 12  publisher           54951 non-null  object 
 13  user_id             54951 non-null  object 
 14  review_id           54951 non-null  object 
 15  rating              54951 non-null  int64  
 16  revi

Unnamed: 0                0
book_id                   0
title                     0
authors                   0
average_rating            0
isbn                      0
isbn13                    0
language_code             0
num_pages                 0
ratings_count             0
text_reviews_count        0
publication_date          0
publisher                 0
user_id                   0
review_id                 0
rating                    0
review_text               0
date_added                0
date_updated              0
read_at                9342
started_at            25396
n_votes                   0
n_comments                0
dtype: int64

In [None]:
plt.figure(figsize=(6, 4))
 
sns.pairplot(df)
 
plt.suptitle('Pair Plot for DataFrame')
plt.show()

In [None]:
# when the rating are higher, there are more votes 
ratings = df['rating'].value_counts()
plt.bar(ratings.index, ratings)
plt.title('Count Plot of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
#sns.boxplot(x="rating", y="n_votes", data=df)

In [None]:
# when there are higher ratings, there are more comments. But this is not as significant as votes. 
# comments = df['n_comments'].value_counts()
# plt.bar(comments.index, df['rating'])
# sns.boxplot(x="rating", y="n_comments", data=df)
comments_per_rating = df.groupby('rating')['n_comments'].sum()

# Plot the number of comments per rating category
plt.bar(comments_per_rating.index, comments_per_rating.values, color ="orange")
plt.xlabel('Rating')
plt.ylabel('Number of Comments')
plt.title('Number of Comments per Rating Category')
plt.show()

In [None]:
votes_per_rating = df.groupby('rating')['n_votes'].sum()

# Plot the number of comments per rating category
plt.bar(votes_per_rating.index, votes_per_rating.values, color="magenta")
plt.xlabel('Rating')
plt.ylabel('Number of Votes')
plt.title('Number of Votes per Rating Category')
plt.show()

Simple exploratory data analysis is helpful but not exptremely helpful for our dataset because there are a lot of qualitative variables including our feature of interest, reviews. 

# Pre-Processing

In [4]:
nltk_stopwords = set(stopwords.words('english'))

def PreprocessSentence(sentence, nltk_stopwords):
    # Input - a sentence in the form of a python string, and the set of stopwords
    # Output - the preprocessed sentence, as per the instructions
    
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    #remove stopwords and single char non-num words
    tokens = [word for word in tokens if word not in nltk_stopwords and (word.isnumeric() or len(word) > 1)]
    preprocessed_sentence = ' '.join(tokens)
    return preprocessed_sentence

def PreprocessData(df, nltk_stopwords):
    # Input - the dataframe, with columns label and text, and the set of stopwords
    # output - the dataframe with the text processed as described earlier
    df['review_text'] = df['review_text'].map(lambda x: PreprocessSentence(x, nltk_stopwords))
    return df

In [5]:
df = PreprocessData(df, nltk_stopwords)

In [6]:
df.rating

0        5
1        3
2        5
3        5
4        5
        ..
54946    4
54947    3
54948    3
54949    3
54950    3
Name: rating, Length: 54951, dtype: int64

# Basic Random Model Based on Overall Distribution of Ratings

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

np.random.seed(80)

X = df['review_text']
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

distribution = np.bincount(y_train)
print(distribution)

y_pred = np.random.choice(len(distribution), size=len(X_test), p=distribution/len(y_train))

accuracy_random = accuracy_score(y_test, y_pred)
print(f'Random Model Accuracy: {accuracy_random:.2f}')

print(classification_report(y_test, y_pred))




[ 1246  1887  4048  9757 14527 12495]
Random Model Accuracy: 0.25
              precision    recall  f1-score   support

           0       0.02      0.02      0.02       311
           1       0.03      0.03      0.03       480
           2       0.10      0.10      0.10      1049
           3       0.22      0.23      0.23      2434
           4       0.32      0.32      0.32      3610
           5       0.28      0.28      0.28      3107

    accuracy                           0.25     10991
   macro avg       0.16      0.16      0.16     10991
weighted avg       0.24      0.25      0.24     10991



# Bag of Words/N-grams
Code adapted from: https://towardsdatascience.com/leveraging-n-grams-to-extract-context-from-text-bdc576b47049

In [None]:
# had to all these imports in this cell because it oddly wasn't working if it was imported in an earlier cell
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

# define X and y data
X = df['review_text']
y = df['rating']


# initialize count vectorizer
bow_cv = CountVectorizer()

# create document-term matrix
X_cv = bow_cv.fit_transform(X)

# classify with logistic regression

np.random.seed(80)
X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size=0.2)
# X_train = X_train.toarray()
# X_test = X_test.toarray()
# # Instantiate the logistic regression model
# #logistic_model = LogisticRegression(max_iter=1000, class_weight='balanced')
# logistic_model = GaussianNB()

# # Fit it to the training data
# logistic_model.fit(X_train, y_train)

# # We can create a set of predictions on the test dataset
# predictions = logistic_model.predict(X_test)

# # Scikit lets you directly compute the score on a dataset using the model's score method
# train_score = logistic_model.score(X_train, y_train)
# test_score = logistic_model.score(X_test, y_test)

# # Now lets view our scores!
# print("Train Accuracy:", round(train_score*100,2), "%")
# print("Test Accuracy:", round(test_score*100,2), "%")


In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Define the accuracies
train_accuracy = 83.96
test_accuracy = 41.45

# Create a bar chart
plt.figure(figsize=(8, 6))
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['lightgreen', 'gold'])
plt.ylim(0, 100)
plt.ylabel('Accuracy (%)')
plt.title('Model Accuracies')

# Annotate the bar chart with accuracy values
plt.text(0, train_accuracy, f'{train_accuracy}%', ha='center', va='bottom', fontsize=12)
plt.text(1, test_accuracy, f'{test_accuracy}%', ha='center', va='bottom', fontsize=12)

plt.show()


In [None]:
# Distribution of Ratings
plt.hist(y, bins=6, alpha=0.7)
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Ratings')
plt.show()

In [None]:
# Feature Importance
# feature_names = bow_cv.get_feature_names()
# coefficients = logistic_model.coef_[0]
# top_features = sorted(zip(coefficients, feature_names), reverse=True)[:20]
# plt.barh([x[1] for x in top_features], [x[0] for x in top_features])
# plt.xlabel('Coefficient')
# plt.ylabel('Feature')
# plt.title('Top 20 Features by Coefficient')
# plt.show()


# TF-IDF

In [6]:
# Step 2: Extract the preprocessed "review_text" column
reviews = df['review_text']

# Step 3: Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Step 4: Fit and transform the preprocessed review text data
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

# Optional: If you want to see the list of features (words) extracted by TF-IDF
feature_names = tfidf_vectorizer.get_feature_names()



In [None]:
def vectorize(df):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['review_text'])
    y_labels = df['rating'].to_numpy()
    feature_names = vectorizer.get_feature_names()
    tfidf = pd.DataFrame(data=X.toarray(), columns=feature_names)
    x_array = X.toarray()
    return X, y_labels, vectorizer

# Step 2: Apply the vectorize function to transform the preprocessed reviews into TF-IDF vectors
X_tfidf, y_labels, vectorizer = vectorize(df)

In [None]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_tfidf, y_labels, test_size=0.2)

# Use the vectorizer to create our X and y data
# X, y, vectorizer = vectorize(df)

#X_tfidf, y_labels, vectorizer = vectorize(df)

#X_tfidf will contain the TF-IDF representation of your preprocessed "review_text" column,
#y_labels will contain the corresponding labels,
#and vectorizer will be the TF-IDF vectorizer used for transformation'

In [None]:
# x_array = X_tfidf.toarray()
# print(x_array)

# Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

np.random.seed(80)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_labels, test_size=0.2)

from sklearn.linear_model import LogisticRegression

# Instantiate the logistic regression model
logistic_model = LogisticRegression(max_iter=10000, class_weight='balanced')


# Fit it to the training data
logistic_model.fit(X_train, y_train)

# We can create a set of predictions on the test dataset
predictions = logistic_model.predict(X_test)

# Scikit lets you directly compute the score on a dataset using the model's score method
train_score = logistic_model.score(X_train, y_train)
test_score = logistic_model.score(X_test, y_test)

# Now lets view our scores!
print("Train Accuracy:", round(train_score*100,2), "%")
print("Test Accuracy:", round(test_score*100,2), "%")

Train Accuracy: 64.43 %
Test Accuracy: 43.37 %


In [None]:
import matplotlib.pyplot as plt

# Define the accuracies
train_accuracy = 64.43
test_accuracy = 43.36

# Create a bar chart
plt.figure(figsize=(8, 6))
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['aqua', 'coral'])
plt.ylim(0, 100)
plt.ylabel('Accuracy (%)')
plt.title('Model Accuracies')

# Annotate the bar chart with accuracy values
plt.text(0, train_accuracy, f'{train_accuracy}%', ha='center', va='bottom', fontsize=12)
plt.text(1, test_accuracy, f'{test_accuracy}%', ha='center', va='bottom', fontsize=12)

plt.show()


In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt

# Lists to store train and test accuracies
train_accuracies = []
test_accuracies = []

# Loop over a range of test sizes or any other variations you want to test
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_labels, test_size=test_size)
    logistic_model.fit(X_train, y_train)
    train_accuracies.append(logistic_model.score(X_train, y_train))
    test_accuracies.append(logistic_model.score(X_test, y_test))

# Plotting train and test accuracies
plt.plot(test_sizes, train_accuracies, label='Train Accuracy')
plt.plot(test_sizes, test_accuracies, label='Test Accuracy')
plt.xlabel('Test Size')
plt.ylabel('Accuracy')
plt.title('Train and Test Accuracies vs Test Size')
plt.legend()
plt.show()


# TF-IDF with Cross-Validation, Multinomial Logistic Regression Solver

In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(80)

X = df['review_text']
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)








In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)




In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

np.random.seed(80)
model = LogisticRegression(multi_class='multinomial', max_iter=1000)

# Perform cross-validation
cross_val_scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-Validation Scores:", cross_val_scores)
print("Mean Validation Accuracy:", cross_val_scores.mean())

#fit the model
model.fit(X_train_tfidf, y_train)

#predict
y_test_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed evaluation
print(classification_report(y_test, y_test_pred))


# TF-IDF with Dense Neural Network, not currently working

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming X_train, X_val, y_train, y_val are your training and validation sets
# X_train and X_val are the text data, y_train and y_val are the ratings

# Step 1: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 2: Build a neural network model
model = Sequential()
model.add(Dense(512, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))  # Assuming 5 ratings (1 to 5)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 3: Train the neural network on the training data
model.fit(X_train_tfidf.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)

# Step 4: Evaluate the model on the validation data
test_loss, test_accuracy = model.evaluate(X_test_tfidf.toarray(), y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')

# Step 5: Predict on the validation data
y_test_pred = model.predict_classes(X_test_tfidf.toarray())

# Print a classification report for more detailed evaluation
print(classification_report(y_test, y_test_pred))

# Attempt at sentiment analysis with GaussianMixtures

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

np.random.seed(80)

X = df['review_text']
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sentiments = []
sentiments.append([])
sentiments.append([])
sentiments.append([])
sentiments.append([])
sentiments.append([])
sentiments.append([])


#create lists of sentiment scores by rating
for i in range(len(X_train)):
    sia = SentimentIntensityAnalyzer()
    compound_score = sia.polarity_scores(X_train.iloc[i])['compound']
    sentiments[y_train.iloc[i]].append([compound_score])


In [None]:
#create guassianmixtures
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

np.random.seed(80)

covs = ['diag', 'full']
components = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
accuracies = []

for component in components:
    for cov in covs:
        models = []

        for ratings in sentiments:
            gmm = GaussianMixture(n_components=7, covariance_type='diag', n_init=3)
            gmm.fit(ratings)
            models.append(gmm)

        #predict train scores


        y_pred = []
        for review in X_test:
            scores = []
            sia = SentimentIntensityAnalyzer()
            compound_score = sia.polarity_scores(review)['compound']
            for m in models:
                score = m.score([[compound_score]])
                scores.append(score)
            y_pred.append(np.argmax(scores))


        #calculate accuracy

        total = len(y_test)
        correct = 0
        for i in range(len(y_test)):
            if y_pred[i] == y_test.iloc[i]:
                correct = correct + 1
        test_accuracy = float(correct)/float(total)
        print("Test Accuracy with covariance: ", cov, " and ", component, " components: ", round(test_accuracy*100,2), "%")
        accuracies.append([test_accuracy, cov, component])
        
best_acc = 0
best_cov = ""
best_comp = 0
for accuracy in accuracies:
    if accuracy[0] > best_acc:
        best_acc = accuracy[0]
        best_cov = accuracy[1]
        best_comp = accuracy[2]
print("Best Test Accuracy with covariance: ", best_cov, " and ", best_comp, " components: ", round(best_acc*100,2), "%")

# SVM

In [None]:
from sklearn import svm
# attempting multiclass classification
# one-versus-one approach
X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size=0.2)
# X = [X_train]
# Y = [0,1,2,3,4,5]
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X_train, y_train)
print("Decision function shape (before change):", clf.decision_function([[1]])) # Output should be the shape of ovo

# Change decision function shape
clf.decision_function_shape = "ovr"

# Check decision function shape again
print("Decision function shape (after change):", clf.decision_function([[1]])) # Output should be the shape of ovr
# SVC(decision_function_shape='ovo')
# dec = clf.decision_function([[1]])
# dec.shape[1] # 6 classes: 6*3/2 = 9
# clf.decision_function_shape = "ovr"
# dec = clf.decision_function([[1]])
# dec.shape[1] # 6 classes

# K-means

In [1]:
from sklearn.model_selection import train_test_split
X_tfidf, y_labels, vectorizer = vectorize(df)
X_original = df.drop(['rating', 'review_text'])
df2 = X_original.assign(X_tfidf=X_tfidf)
X_train, X_test, y_train, y_test = train_test_split(df2, y_labels, test_size=0.2)

# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf_vectorizer = TfidfVectorizer()
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["review_text"])
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# X_tfidf, y_labels, vectorizer = vectorize(df)

# # Convert X_tfidf to a DataFrame
# X_tfidf_df = pd.DataFrame(X_tfidf.toarray())

# # Remove 'rating' and 'review_text' columns from df
# X_original = df.drop(['rating', 'review_text'])

# # Concatenate X_tfidf_df with X_original
# X_all = pd.concat([X_original, X_tfidf_df], axis=1)

# # Splitting the data
# X_train, X_test, y_train, y_test = train_test_split(X_all, y_labels, test_size=0.2, random_state=42)


NameError: name 'vectorize' is not defined

In [19]:
from sklearn.cluster import KMeans

cluster_count = 6
km = KMeans(n_clusters = cluster_count, # The number of clusters we want
            n_init = 10, # This is the number of time the algorithm runs - the result is the one with minimum SSE (sum of squared errors)
            init='random', # How the centroids are initialized
            max_iter = 100,
            random_state = 42)

preds = km.fit_predict(X_train_tfidf)

In [20]:
# theres something wrong with this 

def plotClusters(X_train_tfidf, preds, km, cluster_count, colorlist):
    # Convert sparse matrix to dense array
    X_train_dense = X_train.toarray()
    for i in range(cluster_count):
        plt.scatter(X_train_dense[preds == i, 0],
                    X_train_dense[preds == i, 1],
                    marker="o",
                    c=colorlist[i],
                    label="Cluster " + str(i + 1))
    # plot the centroids
    plt.scatter(km.cluster_centers_[:, 0],
                km.cluster_centers_[:, 1],
                s=250,
                marker='*',
                c='red',
                edgecolor='black',
                label='Centroids')
    plt.legend(scatterpoints=1)
    plt.grid()
    plt.show()
        
plotClusters(X_train_tfidf, preds, km, cluster_count, ['lightgreen', 'lightblue', 'blue', 'pink', 'yellow', 'orange'])

AttributeError: 'DataFrame' object has no attribute 'toarray'

In [None]:
# Calculate within-cluster SSE varying the number of clusters
# The elbow graph will show a sharp decrease in the intra-cluster SSE till a certain point, after which the reduction is slow
# This elbow point is the optimal number of clusters

cluster_sse = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, 
                init='random',
                n_init=10, 
                max_iter=100,
                random_state=42)
    km.fit(X_train)
    cluster_sse.append(km.inertia_)

# plot
plt.plot(range(1, 11), cluster_sse, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Intra-Cluster SSE')
plt.show()

In [None]:
cluster_count = 2
km = KMeans(n_clusters = cluster_count, # The number of clusters we want
            n_init = 10, # This is the number of time the algorithm runs - the result is the one with minimum SSE (sum of squared errors)
            init='random', # How the centroids are initialized
            max_iter = 100,
            random_state = 42)

preds = km.fit_predict(X_train)

plotClusters(X_train, preds, km, cluster_count, ['lightgreen', 'lightblue', 'blue', 'pink', 'yellow', 'orange'])

# Gradient Boosting

In [2]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, max_features = 21, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

NameError: name 'X_train' is not defined

In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'learning_rate': [0.01, 0.2, 0.5],   # Minimum number of samples required to split an internal node
    'random_state': [0, 50, 75]      # Minimum number of samples required to be at a leaf node
}

# Instantiate the Random Forest classifier
clf = GradientBoostingClassifier()

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
test_score = best_model.score(X_test, y_test)
print("Test Accuracy with Best Model:", round(test_score * 100, 2), "%")

# Random Forest Model 

In [None]:
#Ryan Galligan

In [14]:
from sklearn.ensemble import RandomForestClassifier

#X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_tfidf, y_labels, test_size=0.2)

# Instantiate the Random Forest classifier
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit it to the training data
random_forest_model.fit(X_train_rf, y_train_rf)

# Predictions on the test dataset
predictions = random_forest_model.predict(X_test_rf)

# Compute the accuracy scores
train_score = random_forest_model.score(X_train_rf, y_train_rf)
test_score = random_forest_model.score(X_test_rf, y_test_rf)

# Output the scores
print("Train Accuracy:", round(train_score * 100, 2), "%")
print("Test Accuracy:", round(test_score * 100, 2), "%")

Train Accuracy: 99.67 %
Test Accuracy: 42.42 %


In [22]:
from sklearn.model_selection import GridSearchCV

# # Define the parameter grid to search
# param_grid = {
#     'n_estimators': [50, 100, 150],  # Number of trees in the forest
#     'max_depth': [None, 10, 20],       # Maximum depth of the trees
#     'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required to be at a leaf node
# }

# Define a simpler parameter grid
param_grid = {
    'n_estimators': [12, 25, 50, 100, 150, 200],       # Fewer trees to reduce complexity
    'max_depth': [None, 2, 5, 8, 15, 20, 25],        # No limit on depth
    'min_samples_split': [2, 5, 10, 15],   # Minimum samples required to split
    'min_samples_leaf': [1, 2, 4]     # Keep it simple with only one value
}

# Instantiate the Random Forest classifier
random_forest_model = RandomForestClassifier(random_state=42)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search to the training data
grid_search.fit(X_train_rf, y_train_rf)

# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
test_score = best_model.score(X_test_rf, y_test_rf)
print("Test Accuracy with Best Model:", round(test_score * 100, 2), "%")




Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
Test Accuracy with Best Model: 42.64 %


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100],        # Reducing the number of trees
    'max_depth': [None, 10],          # Limiting the maximum depth
    'min_samples_split': [5, 10],     # Increasing min_samples_split
    'min_samples_leaf': [1, 2]        # Adjusting min_samples_leaf
}

# # Instantiate the Random Forest classifier
# random_forest_model = RandomForestClassifier(random_state=42)

# # Perform grid search with 5-fold cross-validation
# grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit grid search to the training data
# grid_search.fit(X_train_rf, y_train_rf)

# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the results of cross-validation
cv_results = grid_search.cv_results_
    
# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
test_score = best_model.score(X_test_rf, y_test_rf)
print("Test Accuracy with Best Model:", round(test_score * 100, 2), "%")


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Test Accuracy with Best Model: 42.55 %


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a simpler parameter grid
param_grid = {
    'n_estimators': [12, 25, 50, 75],       # Fewer trees to reduce complexity
    'max_depth': [2, 5, 8, 15],        # No limit on depth
    'min_samples_split': [5, 10, 15],   # Minimum samples required to split
    'min_samples_leaf': [1, 2]     # Keep it simple with only one value
}

# Instantiate the Random Forest classifier
random_forest_model = RandomForestClassifier(random_state=42)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search to the training data
grid_search.fit(X_train_rf, y_train_rf)

# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
test_score = best_model.score(X_test_rf, y_test_rf)
print("Test Accuracy with Best Model:", round(test_score * 100, 2), "%")


Best Hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 50}
Test Accuracy with Best Model: 36.39 %


In [23]:
##BEST MODEL
#Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
#Test Accuracy with Best Model: 42.64 %
from sklearn.ensemble import RandomForestClassifier

# Instantiate the Random Forest classifier with specific hyperparameters
random_forest_model = RandomForestClassifier(n_estimators=150, max_depth=None, min_samples_split=10, min_samples_leaf=1, random_state=42)

# Fit the model to the training data
random_forest_model.fit(X_train_rf, y_train_rf)

# Evaluate the model on the test data
test_score = random_forest_model.score(X_test_rf, y_test_rf)
print("Test Accuracy:", round(test_score * 100, 2), "%")


Test Accuracy: 42.64 %
