In [10]:
#pip install bayesian-optimization


Collecting bayesian-optimization
  Downloading bayesian_optimization-1.5.1-py3-none-any.whl.metadata (16 kB)
Downloading bayesian_optimization-1.5.1-py3-none-any.whl (28 kB)
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.5.1
Note: you may need to restart the kernel to use updated packages.


In [40]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
#nltk.download('punkt')
#nltk.download('stopwords')

In [69]:
# Load the dataset
data = pd.read_csv(r'C:\Users\ayush\Downloads\0000000000002747_training_twitter_x_y_train.csv')

# Select relevant columns
data = data[['text', 'airline_sentiment']]
data['airline_sentiment'] = data['airline_sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

# Text preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lower case
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation and numbers
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Apply text preprocessing
data['text'] = data['text'].apply(preprocess_text)

# Convert text data to numerical data using TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text']).toarray()
y = data['airline_sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Define the optimization function for Random Forest
def optimize_rf(n_estimators, max_depth, min_samples_split):
    # Convert float parameters to integers since these are required by RandomForestClassifier
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    
    # Create the Random Forest model with given parameters
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, 
                                    max_depth=max_depth, 
                                    min_samples_split=min_samples_split, 
                                    random_state=42)
    
    # Perform cross-validation to evaluate the model's performance
    scores = cross_val_score(rf_clf, X_train, y_train, cv=3, scoring='accuracy')
    
    
    # Return the mean of the cross-validation scores as the objective function value
    return scores.mean()
    


In [71]:
from bayes_opt import BayesianOptimization

# Define the parameter bounds for the Bayesian optimizer
param_bounds = {
    'n_estimators': (10, 300),        # Number of trees in the forest
    'max_depth': (3, 30),             # Maximum depth of the tree
    'min_samples_split': (2, 20)      # Minimum number of samples required to split an internal node
}

# Run Bayesian Optimization
optimizer = BayesianOptimization(f=optimize_rf, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=20)

# Extract the best parameters found by the optimizer
best_params = optimizer.max['params']
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['min_samples_split'] = int(best_params['min_samples_split'])

# Output the best parameters
print("Best Parameters:", best_params)


|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.6292   [39m | [39m13.11    [39m | [39m19.11    [39m | [39m222.3    [39m |
| [35m2        [39m | [35m0.6447   [39m | [35m19.16    [39m | [35m4.808    [39m | [35m55.24    [39m |
| [39m3        [39m | [39m0.6256   [39m | [39m4.568    [39m | [39m17.59    [39m | [39m184.3    [39m |
| [35m4        [39m | [35m0.6473   [39m | [35m22.12    [39m | [35m2.371    [39m | [35m291.3    [39m |
| [35m5        [39m | [35m0.6606   [39m | [35m25.48    [39m | [35m5.822    [39m | [35m62.73    [39m |
| [35m6        [39m | [35m0.6722   [39m | [35m28.8     [39m | [35m2.515    [39m | [35m67.92    [39m |
| [35m7        [39m | [35m0.6792   [39m | [35m30.0     [39m | [35m5.573    [39m | [35m85.4     [39m |
| [39m8        [39m | [39m0.6753   [39m | [39m29.88    [39m | [39m5.917    [39m | [

In [81]:
# Initialize the model with the best parameters
rf_clf = RandomForestClassifier(n_estimators=best_params['n_estimators'], 
                                max_depth=best_params['max_depth'], 
                                min_samples_split=best_params['min_samples_split'], 
                                random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)


In [93]:
# Initialize the Naive Bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(X_train,y_train)

# Combine the classifiers into a voting classifier
voting_clf = VotingClassifier(estimators=[('rf', rf_clf), ('nb', nb_clf)], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)


In [95]:
# Make predictions
y_pred = voting_clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.7523
              precision    recall  f1-score   support

          -1       0.74      0.98      0.85      1356
           0       0.72      0.32      0.44       458
           1       0.87      0.46      0.60       382

    accuracy                           0.75      2196
   macro avg       0.78      0.59      0.63      2196
weighted avg       0.76      0.75      0.72      2196



In [87]:
# Initialize the classifiers
rf_clf = RandomForestClassifier(random_state=42)
nb_clf = MultinomialNB()

# Create the VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_clf),
    ('nb', nb_clf)
], voting='soft')


In [76]:
# Apply GridSearchCV
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)


NameError: name 'GridSearchCV' is not defined

In [None]:
# Best parameters found by GridSearchCV
print("Best parameters found: ", grid_search.best_params_)

# Best estimator
best_voting_clf = grid_search.best_estimator_

# Make predictions with the best estimator
y_pred = best_voting_clf.predict(X_test)

# Evaluate the best model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import joblib

# Save the best model to a file
joblib.dump(best_voting_clf, 'best_voting_classifier_twitter_sentiment.pkl')


In [None]:
# Load the model from file
loaded_model = joblib.load('best_voting_classifier_twitter_sentiment.pkl')

# Make predictions using the loaded model
y_pred_loaded = loaded_model.predict(X_test)
