In [26]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r'C:\Users\ayush\Downloads\Tweets.csv\Tweets.csv')

# Quick look at the data
print(df.head())
print(df.info())


             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [27]:
from sklearn.impute import SimpleImputer

# Separate the numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute missing values in numerical data with mean
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute missing values in categorical data with mode
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Confirm no missing values remain
print(df.isnull().sum())


tweet_id                        0
airline_sentiment               0
airline_sentiment_confidence    0
negativereason                  0
negativereason_confidence       0
airline                         0
airline_sentiment_gold          0
name                            0
negativereason_gold             0
retweet_count                   0
text                            0
tweet_coord                     0
tweet_created                   0
tweet_location                  0
user_timezone                   0
dtype: int64


In [48]:
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encode categorical columns
encoder = OneHotEncoder(sparse_output=False)
encoded_cols = encoder.fit_transform(df[cat_cols])

# Create a DataFrame from the encoded columns
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols))

# Drop the original categorical columns and add the encoded ones
df_encoded = df.drop(cat_cols, axis=1)
df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

# Display the DataFrame with encoded features
print(df_encoded.head())


       tweet_id  airline_sentiment_confidence  negativereason_confidence  \
0  5.703061e+17                        1.0000                   0.638298   
1  5.703011e+17                        0.3486                   0.000000   
2  5.703011e+17                        0.6837                   0.638298   
3  5.703010e+17                        1.0000                   0.703300   
4  5.703008e+17                        1.0000                   1.000000   

   retweet_count  airline_sentiment_negative  airline_sentiment_neutral  \
0            0.0                         0.0                        1.0   
1            0.0                         0.0                        0.0   
2            0.0                         0.0                        1.0   
3            0.0                         1.0                        0.0   
4            0.0                         1.0                        0.0   

   airline_sentiment_positive  negativereason_Bad Flight  \
0                         0.0   

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardize the dataset
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_encoded)

# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df_encoded.columns)

print(df_scaled.head())


   tweet_id  airline_sentiment_confidence  negativereason_confidence  \
0  1.396231                      0.613122                   0.000000   
1  1.389810                     -3.387507                  -2.278628   
2  1.389749                     -1.329462                   0.000000   
3  1.389682                      0.613122                   0.232046   
4  1.389407                      0.613122                   1.291220   

   retweet_count  airline_sentiment_negative  airline_sentiment_neutral  \
0      -0.110828                   -1.296278                   1.929794   
1      -0.110828                   -1.296278                  -0.518190   
2      -0.110828                   -1.296278                   1.929794   
3      -0.110828                    0.771439                  -0.518190   
4      -0.110828                    0.771439                  -0.518190   

   airline_sentiment_positive  negativereason_Bad Flight  \
0                   -0.438718                  -0.203105

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the text data is in a column named 'text'
vectorizer = CountVectorizer(max_features=5000)
text_features = vectorizer.fit_transform(df['text']).toarray()

# Convert to DataFrame
text_features_df = pd.DataFrame(text_features, columns=vectorizer.get_feature_names_out())

# Add to the main dataset
df_final = pd.concat([df_scaled, text_features_df], axis=1)

print(df_final.head())


   tweet_id  airline_sentiment_confidence  negativereason_confidence  \
0  1.396231                      0.613122                   0.000000   
1  1.389810                     -3.387507                  -2.278628   
2  1.389749                     -1.329462                   0.000000   
3  1.389682                      0.613122                   0.232046   
4  1.389407                      0.613122                   1.291220   

   retweet_count  airline_sentiment_negative  airline_sentiment_neutral  \
0      -0.110828                   -1.296278                   1.929794   
1      -0.110828                   -1.296278                  -0.518190   
2      -0.110828                   -1.296278                   1.929794   
3      -0.110828                    0.771439                  -0.518190   
4      -0.110828                    0.771439                  -0.518190   

   airline_sentiment_positive  negativereason_Bad Flight  \
0                   -0.438718                  -0.203105

In [31]:
from sklearn.model_selection import train_test_split

# Assuming the target variable is 'airline_sentiment'
X = df_final.drop('airline_sentiment', axis=1)
y = df['airline_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(11712, 41411) (2928, 41411) (11712,) (2928,)


In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.9982923497267759
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      1889
     neutral       1.00      0.99      1.00       580
    positive       1.00      1.00      1.00       459

    accuracy                           1.00      2928
   macro avg       1.00      1.00      1.00      2928
weighted avg       1.00      1.00      1.00      2928



In [40]:
from skopt import BayesSearchCV
from sklearn.naive_bayes import MultinomialNB

# Define the model and parameters
bayes_opt = BayesSearchCV(estimator=RandomForestClassifier(random_state=42),
                          search_spaces={
                              'n_estimators': (10, 500),
                              'max_depth': (2, 50),
                              'min_samples_split': (2, 10),
                              'min_samples_leaf': (1, 10),
                          }
                          n_iter=20,
                          cv=3,
                          random_state=42)


In [42]:
#fit the model
bayes_opt.fit(X_train, y_train)

# Predict and evaluate
y_pred_optimized = bayes_opt.predict(X_test)
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)

print(f"Optimized Accuracy: {accuracy_optimized}")


KeyboardInterrupt: 

In [44]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB

# Instantiate models
nb = MultinomialNB()
rf = bayes_opt.best_estimator_

# Combine models into a Voting Classifier
voting_clf = VotingClassifier(estimators=[('rf', rf), ('nb', nb)], voting='soft')
voting_clf.fit(X_train, y_train)


AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Final prediction and evaluation
final_pred = voting_clf.predict(X_test)
final_accuracy = accuracy_score(y_test, final_pred)

print(f"Final Accuracy: {final_accuracy}")
