In [None]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [None]:
train_tweets = pd.read_csv('/content/train.csv')
test_tweets = pd.read_csv('/content/test_with_no_labels.csv')

In [None]:
def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

def no_user_alpha(tweet):
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess

def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet

In [None]:


def wrangle(df,post_column):
    df['tweet_list'] = df[post_column].apply(no_user_alpha(form_sentence()))
    df['normalized_tweet'] = df['tweet_list'].apply(normalization())

    return df


In [None]:
pipeline = Pipeline([
    ('bow',CountVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.5, 
                             ngram_range=(1, 1))),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [None]:
msg_train, msg_test, label_train, label_test = train_test_split(train_tweets['message'], train_tweets['sentiment'], test_size=0.2)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

          -1       0.09      0.93      0.17        27
           0       0.10      0.82      0.18        62
           1       0.95      0.64      0.76      2577
           2       0.57      0.78      0.66       498

    accuracy                           0.66      3164
   macro avg       0.43      0.79      0.44      3164
weighted avg       0.87      0.66      0.73      3164

[[  25    1    0    1]
 [   5   51    6    0]
 [ 221  424 1639  293]
 [  15   21   73  389]]
0.6649810366624526


### Trying with other models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC


In [None]:
classifiers = [LogisticRegression,SGDClassifier,RandomForestClassifier,XGBClassifier,AdaBoostClassifier,SVC]
predictions_list = []
models_list =[]
for classifier in classifiers:
  pipeline = Pipeline([
    ('bow',CountVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.5, 
                             ngram_range=(1, 1))),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', classifier()),  # train on TF-IDF vectors 
  ])
  msg_train, msg_test, label_train, label_test = train_test_split(train_tweets['message'], train_tweets['sentiment'], test_size=0.2)
  pipeline.fit(msg_train,label_train)
  predictions = pipeline.predict(msg_test)
  models_list.append(pipeline)
  predictions_list.append(predictions)
  print('=========================================================')
  print(classifier)
  print(classification_report(predictions,label_test))
  print(confusion_matrix(predictions,label_test))
  print(accuracy_score(predictions,label_test))
  print(" ")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
              precision    recall  f1-score   support

          -1       0.31      0.85      0.45        88
           0       0.32      0.65      0.43       249
           1       0.90      0.69      0.79      2176
           2       0.67      0.75      0.71       651

    accuracy                           0.71      3164
   macro avg       0.55      0.74      0.59      3164
weighted avg       0.79      0.71      0.73      3164

[[  75    3    7    3]
 [  30  162   44   13]
 [ 120  317 1511  228]
 [  20   31  111  489]]
0.7070164348925411
 
<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
              precision    recall  f1-score   support

          -1       0.38      0.82      0.52       124
           0       0.35      0.63      0.45       265
           1       0.89      0.73      0.81      2047
           2       0.73      0.74      0.74       728

    accuracy                           0.73      3164

In [None]:
len(predictions_list[0])

3164

In [None]:
len(predictions_list[1])
models_list

[Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                 ('tfidf', TfidfTransformer()),
                 ('classifier', LogisticRegression())]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                 ('tfidf', TfidfTransformer()),
                 ('classifier', SGDClassifier())]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                 ('tfidf', TfidfTransformer()),
                 ('classifier', RandomForestClassifier())]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                 ('tfidf', TfidfTransformer()),
                 ('classifier', XGBClassifier(objective='multi:softprob'))]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                 ('tfidf', TfidfTra

## Import and predict on the test data

In [None]:
df_test = pd.read_csv('/content/test_with_no_labels.csv')

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10546 non-null  object
 1   tweetid  10546 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 164.9+ KB


In [None]:
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [None]:
df_test['pred_1'] = models_list[0].predict(df_test['message'])

In [None]:
df_test.head()

Unnamed: 0,message,tweetid,pred_1
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [None]:
df_test['pred_2'] = models_list[2].predict(df_test['message'])

In [None]:
df_test.head()

Unnamed: 0,message,tweetid,pred_1,pred_2
0,Europe will now be looking to China to make su...,169760,1,1
1,Combine this with the polling of staffers re c...,35326,1,1
2,"The scary, unimpeachable evidence that climate...",224985,1,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0,0


In [None]:
df_test['pred_3'] = models_list[3].predict(df_test['message'])
df_test.head()

Unnamed: 0,message,tweetid,pred_1,pred_2,pred_3
0,Europe will now be looking to China to make su...,169760,1,1,1
1,Combine this with the polling of staffers re c...,35326,1,1,1
2,"The scary, unimpeachable evidence that climate...",224985,1,1,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1,1,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0,0,1


In [None]:
df_test['pred_4'] = models_list[1].predict(df_test['message'])
df_test.head()

Unnamed: 0,message,tweetid,pred_1,pred_2,pred_3,pred_4
0,Europe will now be looking to China to make su...,169760,1,1,1,1
1,Combine this with the polling of staffers re c...,35326,1,1,1,1
2,"The scary, unimpeachable evidence that climate...",224985,1,1,1,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1,1,1,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0,0,1,1


In [None]:
df_test['pred_5'] = models_list[5].predict(df_test['message'])
df_test.head()

Unnamed: 0,message,tweetid,pred_1,pred_2,pred_3,pred_4,pred_5
0,Europe will now be looking to China to make su...,169760,1,1,1,1,1
1,Combine this with the polling of staffers re c...,35326,1,1,1,1,1
2,"The scary, unimpeachable evidence that climate...",224985,1,1,1,1,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1,1,1,1,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0,0,1,1,1


In [None]:
models_list[5]

Pipeline(steps=[('bow',
                 CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                ('tfidf', TfidfTransformer()), ('classifier', SVC())])

## Prepare submissions for kaggle in form of csv files

In [None]:
#df.to_csv('file_name.csv')

In [None]:
kaggle_df_1 = df_test[['tweetid','pred_4']]

In [None]:
kaggle_df_1.head()

Unnamed: 0,tweetid,pred_4
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,0


In [None]:
kaggle_df_1.set_index('tweetid', inplace=True)
kaggle_df_1.rename(columns={'pred_4':'sentiment'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:

kaggle_df_1.head()

Unnamed: 0_level_0,sentiment
tweetid,Unnamed: 1_level_1
169760,1
35326,1
224985,1
476263,1
872928,0


In [None]:
kaggle_df_1.to_csv('first_submission.csv')

In [None]:
pipeline_2 = Pipeline([
    ('bow',CountVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.5, 
                             ngram_range=(1, 1))),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
  
  ])

In [None]:
msg_train2, msg_test2, label_train2, label_test2 = train_test_split(train_tweets['message'], train_tweets['sentiment'], test_size=0.2)
pipeline_2.fit(msg_train2)

Pipeline(steps=[('bow',
                 CountVectorizer(max_df=0.5, min_df=2, stop_words='english')),
                ('tfidf', TfidfTransformer())])

### Focusing and tunning the SGD CLASSIFIER

In [None]:
pipeline_2.transform(msg_train2)

<12655x7802 sparse matrix of type '<class 'numpy.float64'>'
	with 98398 stored elements in Compressed Sparse Row format>

In [None]:
pip install parfit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import parfit.parfit as pf

grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    'n_iter': [1000], # number of epochs
    'loss': ['log'], # logistic regression,
    'penalty': ['l2'],
    'n_jobs': [-1]
}
paramGrid = ParameterGrid(grid)

bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier, paramGrid,
           msg_train2, label_train2, msg_test2, label_test2, 
           metric = roc_auc_score# bestScore='max' scoreLabel = "AUC",
          )

print(bestModel)

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.5s


TypeError: ignored

In [None]:
#!pip install scikit-learn==0.24.2   

In [None]:
pipeline_2.transform(msg_train2)

In [None]:
pipeline_2.transform(msg_test2)

In [None]:
#pip install git+https://github.com/hyperopt/hyperopt-sklearn.git

In [None]:

# automatic svm hyperparameter tuning using skopt for the ionosphere dataset
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/ionosphere.csv'
dataframe = read_csv(url, header=None)
# split into input and output elements
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)
# define search space
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the search
search = BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(X, y)
# report the best result
print(search.best_score_)
print(search.best_params_)

ModuleNotFoundError: ignored

## second work

In [None]:
kaggle_df_2 = df_test[['tweetid','pred_5']]

In [None]:
kaggle_df_2.head()

Unnamed: 0,tweetid,pred_5
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1


In [None]:
kaggle_df_2.rename(columns ={'pred_5':'sentiment'}, inplace=True)
kaggle_df_2.set_index('tweetid', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
kaggle_df_2.head()

Unnamed: 0_level_0,sentiment
tweetid,Unnamed: 1_level_1
169760,1
35326,1
224985,1
476263,1
872928,1


In [None]:
kaggle_df_2.to_csv('second_submission.csv')