
## Group 12 - Python Code ##

#### Imported the necessary libraries 

In [89]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Loaded the dataset (939 records)

In [90]:

df = pd.read_csv("Craigslist_Gigs_Boston.csv")  

# Check
print("Rows:", df.shape[0])
df.head()

Rows: 939


Unnamed: 0.1,Unnamed: 0,post_datetime,post_title_link,post_title_text,post_hood,detail_body,Completed,pay_from_post,pay_rate,description
0,0,2022-09-26T16:16:39.000Z,https://boston.craigslist.org/gbs/lbg/75385228...,* Deliver with DoorDash *,boston/cambridge/brookline,,True,,lump,Far skin small then. Gig available immediately...
1,1,2022-09-26T16:13:53.000Z,https://boston.craigslist.org/gbs/dmg/75385300...,Up to $300 Paid Research for Toyota / Lexus Dr...,Boston,,True,300.0,lump,"Need part-time driver, flexible hours, pay per..."
2,2,2022-09-26T16:08:33.000Z,https://boston.craigslist.org/gbs/lbg/75385227...,Grubhub Delivery Driver - No Experience Needed!,Boston,,True,,hourly,"Urgent delivery gig: lightweight items, 2-4 ho..."
3,3,2022-09-26T15:57:42.000Z,https://boston.craigslist.org/gbs/tlg/75385206...,"The most fun work at live events! 52,000 real ...",Greater Boston area,,True,30.0,hourly,"Event gig: greeting guests, handing out materi..."
4,4,2022-09-26T15:44:01.000Z,https://boston.craigslist.org/gbs/lbg/75385138...,💸📈💸📈LAWN CARE PROS - MAKE UP TO $1000 PER WEEK,boston/cambridge/brookline,,True,1000.0,weekly,Into heart speak challenge turn. Gig available...


#### Preprocessing Data (Lower case, Lemmatization, Stop words removal)

In [91]:
#import necessary libraries

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text_lite(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()  
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df['clean_description'] = df['description'].apply(preprocess_text_lite)

# Preview
df[['description', 'clean_description']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,description,clean_description
0,Far skin small then. Gig available immediately...,far skin small gig available immediately local...
1,"Need part-time driver, flexible hours, pay per...",need parttime driver flexible hour pay per trip
2,"Urgent delivery gig: lightweight items, 2-4 ho...",urgent delivery gig lightweight item hour
3,"Event gig: greeting guests, handing out materi...",event gig greeting guest handing material
4,Into heart speak challenge turn. Gig available...,heart speak challenge turn gig available immed...


#### Word Frequency Normalization (TF-IDF with 1–2 grams, min_df=3)

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF 
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3)
X = vectorizer.fit_transform(df['clean_description'])

print("TF-IDF matrix shape:", X.shape)



TF-IDF matrix shape: (939, 1041)


#### Descriptive Modeling: K-Means Clustering


In [93]:

from sklearn.cluster import KMeans

k = 4  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Top keywords per cluster
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("\nTop 10 Keywords per Cluster:\n")
for i in range(k):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print(f"Cluster {i}: {', '.join(top_terms)}")




Top 10 Keywords per Cluster:

Cluster 0: area, site, immediately local, available, local area, gig available, immediately, available immediately, senior, local
Cluster 1: need, assist, furniture, needed, assist moving, moving, driver, help, box, parttime
Cluster 2: act, let, tax, area, local area, available immediately, available, immediately, immediately local, gig available
Cluster 3: area, local, available, immediately, gig available, available immediately, immediately local, local area, gig, event


Inferred Themes from Clustering Output:


Cluster 0 – Location-Based Gig Availability:
Keywords like area, site, immediately local, available, gig available, local area, senior suggest postings focused on local short-term gigs often involving specific locations or site-based tasks, possibly for elderly care or house help.

Cluster 1 – Moving & Manual Labor Assistance:
Keywords like assist, furniture, moving, driver, help, box, parttime point to gigs that involve physical help or moving assistance, such as lifting, delivery, or driving.

Cluster 2 – Legal or Admin Tasks (Possible Spam/Irrelevant):
Keywords like act, let, tax mixed with area, available immediately are a bit vague but may suggest legal or formal tasks or potentially spammy or unclear listings that reuse common tokens.

Cluster 3 – General Local Availability/Repeat Phrases:
Keywords like area, local, available, immediately are highly repetitive and generic. This cluster likely captures general-purpose or poorly written gigs that frequently use availability phrases.



#### Sentiment Label Creation using VADER

In [94]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def get_sentiment_label(text):
    score = sid.polarity_scores(str(text))['compound']
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['description'].apply(get_sentiment_label)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dipitabiswas/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Train-Test Split (70/30) for Sentiment Classification

In [95]:
from sklearn.model_selection import train_test_split

# Define X and y
y = df['sentiment_label']

# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Training size:", X_train.shape[0])
print("Test size:", X_test.shape[0])


Training size: 657
Test size: 282


#### Model & Validation: Naive Bayes Model 

In [96]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#Train the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

#Make predictions on train and test sets
y_pred_nb_train = nb_model.predict(X_train)
y_pred_nb = nb_model.predict(X_test)

#Calculate training and test accuracy
train_accuracy_nb = accuracy_score(y_train, y_pred_nb_train)
test_accuracy_nb = accuracy_score(y_test, y_pred_nb)

#Evaluation results
print("\n====== Naive Bayes Classification Report ======\n")
print(classification_report(y_test, y_pred_nb))

print("\n====== Confusion Matrix ======\n")
print(confusion_matrix(y_test, y_pred_nb))

print("\n====== Accuracy Summary ======\n")
print(f"Naive Bayes Train Accuracy: {train_accuracy_nb * 100:.2f}%")
print(f"Naive Bayes Test Accuracy: {test_accuracy_nb * 100:.2f}%")







              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        34
     neutral       0.73      0.80      0.76       124
    positive       0.68      0.81      0.74       124

    accuracy                           0.71       282
   macro avg       0.47      0.53      0.50       282
weighted avg       0.62      0.71      0.66       282



[[  0  12  22]
 [  0  99  25]
 [  0  24 100]]


Naive Bayes Train Accuracy: 85.69%
Naive Bayes Test Accuracy: 70.57%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Model & Validation: Logistic Regression Model

In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_pred_lr_train = lr_model.predict(X_train)
y_pred_lr = lr_model.predict(X_test)

# Calculate training and test accuracy
train_accuracy_lr = accuracy_score(y_train, y_pred_lr_train)
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Evaluation results
print("\n====== Logistic Regression Classification Report ======\n")
print(classification_report(y_test, y_pred_lr))

print("\n====== Confusion Matrix ======\n")
print(confusion_matrix(y_test, y_pred_lr))

print("\n====== Accuracy Summary ======\n")
print(f"Logistic Regression Train Accuracy: {train_accuracy_lr * 100:.2f}%")
print(f"Logistic Regression Test Accuracy: {test_accuracy_lr * 100:.2f}%")





              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        34
     neutral       0.70      0.84      0.76       124
    positive       0.71      0.76      0.73       124

    accuracy                           0.70       282
   macro avg       0.47      0.53      0.50       282
weighted avg       0.62      0.70      0.66       282



[[  0  15  19]
 [  0 104  20]
 [  0  30  94]]


Logistic Regression Train Accuracy: 89.95%
Logistic Regression Test Accuracy: 70.21%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Model & Validation: Random Forest Model

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_pred_rf_train = rf_model.predict(X_train)
y_pred_rf = rf_model.predict(X_test)

# Calculate training and test accuracy
train_accuracy_rf = accuracy_score(y_train, y_pred_rf_train)
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Evaluation results
print("\n====== Random Forest Classification Report ======\n")
print(classification_report(y_test, y_pred_rf))

print("\n====== Confusion Matrix ======\n")
print(confusion_matrix(y_test, y_pred_rf))

print("\n====== Accuracy Summary ======\n")
print(f"Random Forest Train Accuracy: {train_accuracy_rf * 100:.2f}%")
print(f"Random Forest Test Accuracy: {test_accuracy_rf * 100:.2f}%")





              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        34
     neutral       0.67      0.77      0.72       124
    positive       0.65      0.73      0.69       124

    accuracy                           0.66       282
   macro avg       0.44      0.50      0.47       282
weighted avg       0.58      0.66      0.62       282



[[ 0 14 20]
 [ 0 96 28]
 [ 0 33 91]]


Random Forest Train Accuracy: 100.00%
Random Forest Test Accuracy: 66.31%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Model & Validation: Support Vector Machine (SVM) Model

In [99]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train the SVM model
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_pred_svm_train = svm_model.predict(X_train)
y_pred_svm = svm_model.predict(X_test)

# Calculate training and test accuracy
train_accuracy_svm = accuracy_score(y_train, y_pred_svm_train)
test_accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Step 5: Print evaluation results
print("\n====== SVM Classification Report ======\n")
print(classification_report(y_test, y_pred_svm))

print("\n====== Confusion Matrix ======\n")
print(confusion_matrix(y_test, y_pred_svm))

print("\n====== Accuracy Summary ======\n")
print(f"SVM Train Accuracy: {train_accuracy_svm * 100:.2f}%")
print(f"SVM Test Accuracy: {test_accuracy_svm * 100:.2f}%")





              precision    recall  f1-score   support

    negative       0.43      0.18      0.25        34
     neutral       0.71      0.82      0.76       124
    positive       0.74      0.74      0.74       124

    accuracy                           0.71       282
   macro avg       0.63      0.58      0.58       282
weighted avg       0.69      0.71      0.69       282



[[  6  14  14]
 [  3 102  19]
 [  5  27  92]]


SVM Train Accuracy: 100.00%
SVM Test Accuracy: 70.92%


#### Model & Validation: Decision Tree Model 

In [100]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_pred_dt_train = dt_model.predict(X_train)
y_pred_dt = dt_model.predict(X_test)

# Calculate training and test accuracy
train_accuracy_dt = accuracy_score(y_train, y_pred_dt_train)
test_accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Evaluation results
print("\n====== Decision Tree Classification Report ======\n")
print(classification_report(y_test, y_pred_dt))

print("\n====== Confusion Matrix ======\n")
print(confusion_matrix(y_test, y_pred_dt))

print("\n====== Accuracy Summary ======\n")
print(f"Decision Tree Train Accuracy: {train_accuracy_dt * 100:.2f}%")
print(f"Decision Tree Test Accuracy: {test_accuracy_dt * 100:.2f}%")






              precision    recall  f1-score   support

    negative       0.42      0.24      0.30        34
     neutral       0.73      0.86      0.79       124
    positive       0.78      0.73      0.75       124

    accuracy                           0.73       282
   macro avg       0.64      0.61      0.61       282
weighted avg       0.71      0.73      0.71       282



[[  8  13  13]
 [  4 107  13]
 [  7  27  90]]


Decision Tree Train Accuracy: 100.00%
Decision Tree Test Accuracy: 72.70%
