In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
# Let pandas render up to 999 columns before truncating wide frames.
pd.set_option("display.max_columns", 999)



In [None]:
# Read the dataset into a DataFrame; the path is absolute and specific to this environment.
csv_path = r'/content/data.csv'
df = pd.read_csv(csv_path)

In [None]:
import re

# Matches any of the characters '[', ']' or '<'.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Replace each matched character in a column name with '_'; names without a
# match (checked on their string form) are kept exactly as they are.
sanitized_columns = []
for col in df.columns.values:
    if regex.search(str(col)):
        sanitized_columns.append(regex.sub("_", col))
    else:
        sanitized_columns.append(col)
df.columns = sanitized_columns

In [None]:
# Coerce the text and label columns to strings; missing values become '' so
# that they can be filtered out below.
df['extracted_sentence'] = df['extracted_sentence'].fillna('').astype(str)
df['Label'] = df['Label'].fillna('').astype(str)

# Keep only rows that have both a sentence and a label. The explicit copy makes
# `df` an independent frame, so re-running this cell does not assign into a
# slice of the previous frame (SettingWithCopyWarning).
has_text_and_label = (df['extracted_sentence'] != '') & (df['Label'] != '')
df = df[has_text_and_label].copy()

X = df['extracted_sentence']
y = df['Label']

# Stratified 80/20 split. The seed is fixed so that every run of the notebook
# produces the same partition and the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the training sentences only,
# then project both splits into that feature space.
vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
print("Unique labels in the dataset:", df['Label'].unique())

# Features and labels of the full (cleaned) dataset
X = df['extracted_sentence']
y = df['Label']


def _count_label(labels, target):
    """Count entries of `labels` equal to `target`, ignoring case and surrounding whitespace."""
    return (labels.str.strip().str.lower() == target).sum()


# NOTE(review): the recorded run printed 0 for every count below even though both
# labels were listed as present. Matching is therefore tolerant of surrounding
# whitespace as well as case; if the counts are still zero, inspect the raw
# values with `df['Label'].map(repr).unique()` to find hidden characters.
print("Original dataset - Positive samples:", _count_label(y, 'positive'))
print("Original dataset - Negative samples:", _count_label(y, 'negative'))

# The train/test split created in the earlier cell is reused here instead of
# being redrawn: an unseeded re-split at this point would leave any features
# already derived from the previous split (e.g. X_train_tfidf / X_test_tfidf)
# paired with labels that belong to different rows.

# Check training and testing set sizes
print("Number of training samples:", X_train.shape[0])
print("Number of testing samples:", X_test.shape[0])

# Count training set labels
train_positive_count = _count_label(y_train, 'positive')
train_negative_count = _count_label(y_train, 'negative')

# Count testing set labels
test_positive_count = _count_label(y_test, 'positive')
test_negative_count = _count_label(y_test, 'negative')

# Print the counts
print("Training set - Positive samples:", train_positive_count)
print("Training set - Negative samples:", train_negative_count)
print("Testing set - Positive samples:", test_positive_count)
print("Testing set - Negative samples:", test_negative_count)

Unique labels in the dataset: ['Negative' 'positive']
Original dataset - Positive samples: 0
Original dataset - Negative samples: 0
Number of training samples: 648
Number of testing samples: 162
Training set - Positive samples: 0
Training set - Negative samples: 0
Testing set - Positive samples: 0
Testing set - Negative samples: 0


In [None]:
# Build the TF-IDF features from the current X_train / X_test right here, so
# that every feature row is guaranteed to belong to the same sample as the
# corresponding entry of y_train / y_test, even if the data was re-split after
# the features were last computed. (A row-count check alone cannot detect that
# kind of misalignment, because two splits of the same size pass it.)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Sanity checks: features and labels must have the same number of samples
assert X_train_tfidf.shape[0] == y_train.shape[0], "Mismatch in number of samples between X_train_tfidf and y_train"
assert X_test_tfidf.shape[0] == y_test.shape[0], "Mismatch in number of samples between X_test_tfidf and y_test"

# Train a Logistic Regression model
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_test = logreg.predict(X_test_tfidf)

# Calculate accuracy and the support-weighted F1 score
acc = accuracy_score(y_test, y_pred_test)
fscore = f1_score(y_test, y_pred_test, average='weighted')

# Print the results
print("Accuracy of Logistic Regression: %.2f%%" % (acc * 100.0))
print("F1 Score of Logistic Regression: %.2f%%" % (fscore * 100.0))

Accuracy of Logistic Regression: 80.86%
F1 Score of Logistic Regression: 73.88%


In [None]:
# Rebuild the TF-IDF features from the current train/test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# The Gaussian Naive Bayes model is fed dense arrays, so convert each sparse
# matrix once up front.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# Fit the model and predict the held-out labels.
gnb = GaussianNB().fit(X_train_dense, y_train)
y_pred_test1 = gnb.predict(X_test_dense)

# Accuracy and support-weighted F1 on the test split.
acc = accuracy_score(y_test, y_pred_test1)
fscore = f1_score(y_test, y_pred_test1, average='weighted')

print("Accuracy of GaussianNB: %.2f%%" % (acc * 100.0))
print("F1 Score of GaussianNB: %.2f%%" % (fscore * 100.0))

Accuracy of GaussianNB: 85.19%
F1 Score of GaussianNB: 84.19%


In [None]:
# Rebuild the TF-IDF features from the current train/test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# The classifier is trained and queried on dense arrays.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# 3-nearest-neighbours classifier using a ball tree for the neighbour search.
clf = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree').fit(X_train_dense, y_train)
y_pred3 = clf.predict(X_test_dense)

# Support-weighted F1 and accuracy on the test split.
fscore = f1_score(y_test, y_pred3, average='weighted')
acc3 = accuracy_score(y_test, y_pred3)

print("Accuracy of KNN: %.2f%%" % (acc3 * 100.0))
print("F1 Score of KNN: %.2f%%" % (fscore * 100.0))

Accuracy of KNN: 82.72%
F1 Score of KNN: 81.55%


In [None]:
# TF-IDF features: vocabulary learned on the training sentences, applied to both splits.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Resample the training set only with SMOTE; the test set is left untouched.
# NOTE(review): no cell visible here imports SMOTE. Presumably
# `from imblearn.over_sampling import SMOTE` was run elsewhere; confirm, otherwise
# this cell raises NameError on a fresh kernel.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# RBF-kernel SVM with fixed hyper-parameters, trained on the resampled data.
svm_params = dict(C=50, kernel='rbf', gamma=1)
svc1 = SVC(**svm_params).fit(X_train_resampled, y_train_resampled)

# Predict the held-out labels.
y_pred4 = svc1.predict(X_test_tfidf)

# Support-weighted F1 and accuracy on the test split.
fscore = f1_score(y_test, y_pred4, average='weighted')
acc4 = accuracy_score(y_test, y_pred4)

# Report both metrics as percentages.
print("Accuracy of SVM: %.2f%%" % (acc4 * 100.0))
print("Fscore of SVM: %.2f%%" % (fscore * 100.0))

Accuracy of SVM: 86.42%
Fscore of SVM: 84.73%


In [None]:
# TF-IDF features: vocabulary learned on the training sentences, applied to both splits.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Resample the training set only with SMOTE.
# NOTE(review): no cell visible here imports SMOTE. Presumably
# `from imblearn.over_sampling import SMOTE` was run elsewhere; confirm, otherwise
# this cell raises NameError on a fresh kernel.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Seeded decision tree trained on the resampled data.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# Predict the held-out labels.
y_pred67 = dt.predict(X_test_tfidf)

# Support-weighted F1 and accuracy on the test split.
# `acc2` is also assigned by the Random Forest cell, so it holds whichever ran last.
fscore = f1_score(y_test, y_pred67, average='weighted')
acc2 = accuracy_score(y_test, y_pred67)

# Report both metrics as percentages.
print("Accuracy of Decision Tree: %.2f%%" % (acc2 * 100.0))
print("Fscore of Decision Tree: %.2f%%" % (fscore * 100.0))

Accuracy of Decision Tree: 79.01%
Fscore of Decision Tree: 79.01%


In [None]:
# TF-IDF features: vocabulary learned on the training sentences, applied to both splits.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Resample the training set only with SMOTE.
# NOTE(review): no cell visible here imports SMOTE. Presumably
# `from imblearn.over_sampling import SMOTE` was run elsewhere; confirm, otherwise
# this cell raises NameError on a fresh kernel.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Seeded random forest trained on the resampled data.
rf = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# Predict the held-out labels.
y_pred2 = rf.predict(X_test_tfidf)

# Support-weighted F1 and accuracy on the test split.
# `acc2` is also assigned by the Decision Tree cell, so it holds whichever ran last.
fscore = f1_score(y_test, y_pred2, average='weighted')
acc2 = accuracy_score(y_test, y_pred2)

# Report both metrics as percentages.
print("Accuracy of Random Forest: %.2f%%" % (acc2 * 100.0))
print("Fscore of Random Forest: %.2f%%" % (fscore * 100.0))

Accuracy of Random Forest: 85.80%
Fscore of Random Forest: 83.88%


In [None]:
# TF-IDF features: vocabulary learned on the training sentences, applied to both splits.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Resample the training set only with SMOTE; both ensembles below share the result.
# NOTE(review): no cell visible here imports SMOTE. Presumably
# `from imblearn.over_sampling import SMOTE` was run elsewhere; confirm, otherwise
# this cell raises NameError on a fresh kernel.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# --- Extra Trees ---
et = ExtraTreesClassifier(random_state=42).fit(X_train_smote, y_train_smote)
y_pred21 = et.predict(X_test_tfidf)

acc_et = accuracy_score(y_test, y_pred21)
fscore_et = f1_score(y_test, y_pred21, average='weighted')

print("Accuracy of Extra Trees: %.2f%%" % (acc_et * 100.0))
print("Fscore of Extra Trees: %.2f%%" % (fscore_et * 100.0))

# --- AdaBoost ---
ab = AdaBoostClassifier(random_state=42).fit(X_train_smote, y_train_smote)
y_pred22 = ab.predict(X_test_tfidf)

acc_ab = accuracy_score(y_test, y_pred22)
fscore_ab = f1_score(y_test, y_pred22, average='weighted')

print("Accuracy of AdaBoost: %.2f%%" % (acc_ab * 100.0))
print("Fscore of AdaBoost: %.2f%%" % (fscore_ab * 100.0))

Accuracy of Extra Trees: 88.27%
Fscore of Extra Trees: 86.68%




Accuracy of AdaBoost: 77.78%
Fscore of AdaBoost: 78.01%
