### Data Wrangling

In [None]:
# Colab-only setup: mount Google Drive so the project folder is reachable from
# the notebook's file system. This cell can only run inside Google Colab, since
# `google.colab` is imported here.
from google.colab import drive

# Mount at /content/gdrive; force_remount=True asks for a remount on re-run.
drive.mount('/content/gdrive', force_remount=True)

# Project folder on the mounted drive.
# NOTE(review): the loading cell below opens bare file names relative to the
# current working directory and never references root_dir — confirm whether
# the data files are expected under this folder.
root_dir = "/content/gdrive/My Drive/NLP Sentiment Analysis"

In [1]:
def load_reviews(path, encoding='utf-8'):
    """Read one review file and return the review texts as a list of strings.

    Each non-blank line is treated as one review carrying a leading label
    followed by a tab (the original code stripped ``len('4\\t')`` characters).

    * Blank lines (typically produced by a trailing newline at the end of the
      file) are skipped so they do not become empty, labelled documents.
    * When the text before the first tab is numeric, everything up to and
      including that tab is removed, so multi-digit labels such as ``10\\t``
      are handled too. Otherwise the original fixed two-character slice is
      applied, keeping the previous behaviour for any other layout.

    Parameters
    ----------
    path : str
        File to read, relative to the current working directory.
    encoding : str
        Text encoding of the file (default ``'utf-8'``).
    """
    with open(path, encoding=encoding) as f:
        contents = f.read()

    reviews = []
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as '\x85' or '\u2028' that may legitimately occur inside a review.
    for line in contents.split('\n'):
        if not line.strip():
            continue
        label, tab, text = line.partition('\t')
        if tab and label.strip().isdigit():
            reviews.append(text)
        else:
            reviews.append(line[len('4\t'):])
    return reviews


train_neg_reviews = load_reviews('train_neg_reviews.txt')
train_pos_reviews = load_reviews('train_pos_reviews.txt')
test_neg_reviews = load_reviews('test_neg_reviews.txt')
test_pos_reviews = load_reviews('test_pos_reviews.txt')

In [2]:
# Training corpus: all negative reviews first, then all positive ones.
train_docs = [*train_neg_reviews, *train_pos_reviews]
# Matching labels in the same order: 0 = negative, 1 = positive.
y_train = [0 for _ in train_neg_reviews] + [1 for _ in train_pos_reviews]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training documents only, and
# produce their TF-IDF matrix in the same pass.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_docs)

In [4]:
# Held-out corpus, built the same way as the training one (negatives first).
test_docs = [*test_neg_reviews, *test_pos_reviews]
y_test = [0 for _ in test_neg_reviews] + [1 for _ in test_pos_reviews]
# Re-use the vectorizer fitted on the training documents; it is not re-fitted.
X_test = vectorizer.transform(test_docs)



# Combined data set (training rows followed by test rows), vectorized with the
# vocabulary learned from the training documents.
y = [*y_train, *y_test]
X = vectorizer.transform([*train_docs, *test_docs])

# Sanity check: row counts must agree with label counts for every split.
shape_summary = (X_train.shape, len(y_train), X_test.shape, len(y_test), X.shape, len(y))
print(*shape_summary)

(12500, 56831) 12500 (25000, 56831) 25000 (37500, 56831) 37500


A random forest is an ensemble machine learning model made up of many decision trees. Ensemble models combine the predictions of multiple individual models to make more accurate predictions than any single model would. In a random forest, each decision tree is trained on a random (bootstrap) sample of the training data and considers only a random subset of the features at each split; the final prediction is made by aggregating the predictions of all the individual decision trees — averaging their predicted class probabilities for classification, or their outputs for regression.

Here is an example of how to train a random forest using the scikit-learn library in Python:


## Model Fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees.
# n_estimators is spelled out so the comment stays true regardless of the
# library default; random_state makes the reported score reproducible across
# re-runs; n_jobs=-1 builds the trees in parallel without changing the result.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model on training data
model.fit(X_train, y_train)

# Mean accuracy on the held-out test set (displayed as the cell output)
model.score(X_test, y_test)

In [None]:
# get feature (word) importances
# One importance value per TF-IDF column (i.e. per vocabulary word), taken
# from the forest fitted in the previous cell.
word_importances = model.feature_importances_
print(word_importances)

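Grid search can help us find the best hyperparameters: it evaluates every combination in a predefined grid using cross-validation and keeps the best-scoring one.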

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for the model.
# 3 * 4 * 3 * 5 = 180 combinations, each fitted cv=5 times -> 900 forest fits,
# so expect this cell to run for a long time.
# NOTE(review): the labels are binary and the forest uses its default Gini
# criterion, whose impurity never exceeds 0.5, so min_impurity_decrease=1 can
# never allow a split — consider much smaller values (e.g. 1e-4, 1e-3).
param_grid = {
    'n_estimators': [10, 100, 1000],
    'max_depth': [5, 10, 50, 100],
    'min_impurity_decrease': [0, 0.1, 1],
    'max_features': [1, 10, 100, 1000, None]
}

# A fixed seed ensures that score differences between grid points come from
# the hyperparameters rather than from forest randomness.
model_grid = RandomForestClassifier(random_state=42)

# Use GridSearchCV to search for the best hyperparameters.
# n_jobs=-1 spreads the independent fits over all available cores; it does not
# change which combination wins.
clf = GridSearchCV(model_grid, param_grid, cv=5, n_jobs=-1)

# Cross-validate on the combined (train + test) matrix built earlier.
clf.fit(X, y)

# Print the best hyperparameters
print(f"Best hyperparameters: {clf.best_params_}. Score: {clf.best_score_:.2f}")

In [None]:
# Import the necessary libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

# Create a gradient boosting classifier; the seed keeps the reported score
# reproducible across re-runs.
# NOTE: this rebinds `clf`, which previously held the GridSearchCV object.
clf = GradientBoostingClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test set (displayed as the cell output)
clf.score(X_test, y_test)

In [None]:
import xgboost as xgb

# Build the XGBoost classifier and fit it on the training split in one step
# (fit returns the fitted estimator itself).
model = xgb.XGBClassifier().fit(X_train, y_train)

# Report test-set accuracy as a percentage with two decimals.
accuracy = model.score(X_test, y_test)
report_template = "Accuracy: %.2f%%"
print(report_template % (accuracy * 100.0))

## Contextual Polarity

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Fit a linear model on the TF-IDF features: the sign and size of each
# coefficient indicate how strongly a word pushes towards class 1 (positive)
# or class 0 (negative).
model = LogisticRegression()
model.fit(X_train, y_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

# Column indices ordered from most negative to most positive coefficient.
sorted_coef_index = model.coef_[0].argsort()
print("Negative Words", feature_names[sorted_coef_index[:10]])
print("Positive Words", feature_names[sorted_coef_index[-10:]])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report

# Seed shared by the split and by every stochastic classifier, so that the
# reports below are reproducible across re-runs.
RANDOM_STATE = 42

# Display names, in the same order as `classifiers` below.
names = [
    #"Nearest Neighbors",
    #"Linear SVM",
    #"RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    #KNeighborsClassifier(30),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=50, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=50, n_estimators=100, max_features=10,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# Guard against the two parallel lists drifting apart when entries are
# commented in or out (zip would silently truncate to the shorter one).
assert len(names) == len(classifiers), "names and classifiers must stay aligned"

# preprocess dataset, split into training and test part
# NOTE(review): X_normal is computed but the split below uses the unscaled X,
# exactly as before — confirm whether the scaled matrix was meant to be used.
X_normal = StandardScaler(with_mean=False).fit_transform(X)

# NOTE: this rebinds X_train / X_test / y_train / y_test to a fresh 60/40 split
# of the combined data, replacing the original train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

# Densify both splits once, outside the loop. Several of the estimators above
# (e.g. GaussianNB, QDA, the Gaussian process) do not accept sparse input, and
# the previous code densified only the training data, so predict() was handed
# a sparse matrix. WARNING: with the shapes printed earlier (37500 x 56831 in
# total) these dense arrays need many gigabytes of RAM.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Accuracy per classifier name, filled in by the loop below.
scores = {}
# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf.fit(X_train_dense, y_train)
    print(name)
    y_pred = clf.predict(X_test_dense)
    scores[name] = float(np.mean(np.asarray(y_pred) == np.asarray(y_test)))
    print(classification_report(y_test, y_pred))

In [None]:
# doc2vec embeddings (instead of TF-IDF)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenize once (whitespace split) and reuse the tokens for both training and
# inference, so the two steps see exactly the same representation.
tokenized_train_docs = [doc.split() for doc in train_docs]

# Create a list of TaggedDocument objects, tagged with the document's index
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_train_docs)]

# Initialize the Doc2Vec model
# NOTE: this rebinds `model`, which previously held a scikit-learn estimator.
model = Doc2Vec(vector_size=300, min_count=1, epochs=50)

# Build the vocabulary
model.build_vocab(documents)

# Train the model
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# Generate a vector representation for every training document.
# infer_vector expects a list of tokens, not a raw string; the previous code
# passed the untokenized review text.
# NOTE: these vectors follow the order of train_docs (negatives, then
# positives), so the matching labels are the ones built alongside train_docs,
# not the y_train rebound by train_test_split in the comparison cell.
X_train = [model.infer_vector(tokens) for tokens in tokenized_train_docs]