In [None]:
import pandas as pd
train = pd.read_csv('/content/sample_data/train.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('/content/sample_data/test.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


### **DATA** **CLEANING**

In [None]:
import re
def clean_text(df,text_field):
  df[text_field] = df[text_field].str.lower()
  pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
  df[text_field] = df[text_field].apply(lambda elem: re.sub(pattern, "", elem))
  return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")


### **HANDLING IMBALANCED DATA**

In [None]:
from sklearn.utils import resample
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority,replace=True,n_samples=len(train_majority),random_state=123)
train_upsampled = pd.concat([train_minority_upsampled , train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

### **CREATING A PIPELINE**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('nb',SGDClassifier())])

### **TRAINING** **THE** **MODEL** 

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_upsampled['tweet'],train_upsampled['label'],random_state=0,test_size=0.2)
model = pipeline.fit(X_train,y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9713617767387493