In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_train=pd.read_csv('train_cleaned.csv')
df_test=pd.read_csv('test_cleaned.csv')

In [3]:
punc = list(string.punctuation)
stop_word = stopwords.words('english')
lemma = WordNetLemmatizer()

### Applying lemmatization
   meeting    --->   meet               
   was        --->    be                
   mice       --->   mouse  

In [4]:
def preprocessing(data):
    data = re.sub('[^a-zA-Z]', ' ', data)
    data = data.lower()
    data = data.split()
    

    data = ' '.join([lemma.lemmatize(word) for word in data if (word not in punc) and (word not in stop_word)])
    return data

In [5]:
df_train['Final_text'] = df_train['text'].apply(preprocessing)
df_test['Final_text'] = df_test['text'].apply(preprocessing)

In [6]:
df_train.head()

Unnamed: 0,ID,title,text,subject,class,Final_text
0,0,#AfterTrumpImplodes Hashtag Hilariously Imagi...,What will the world be like post-Donald Trump?...,News,0.0,world like post donald trump new hashtag after...
1,1,#BlackLivesMatter Leader To Run For Mayor Of ...,The police shooting of black teen Michael Brow...,News,0.0,police shooting black teen michael brown hand ...
2,2,#BringBackObama Hashtag Blows Up On Twitter A...,The six months since President Donald Trump wa...,News,0.0,six month since president donald trump inaugur...
3,3,#FreeChrisChristie: Twitter Reacts To The ‘Ho...,"Last Friday, New Jersey Governor Chris Christi...",News,0.0,last friday new jersey governor chris christie...
4,4,#MakeAmericaBrannigan: Futurama Voice Actor R...,"The incredibly talented voice actor, Billy Wes...",News,0.0,incredibly talented voice actor billy west pro...


### separating the features (Text) and (class)

In [7]:
def drop_col(data):
    data = data.drop(columns=['ID', 'subject', 'title', 'text'], axis=1)
    data = data.dropna()
    return data

df_train = drop_col(df_train)
df_test = drop_col(df_test)

df_train.head(3)


Unnamed: 0,class,Final_text
0,0.0,world like post donald trump new hashtag after...
1,0.0,police shooting black teen michael brown hand ...
2,0.0,six month since president donald trump inaugur...


In [8]:
df_test.head(3)

Unnamed: 0,Final_text
0,new york reuters u judge thursday repeatedly p...
1,greenbelt md reuters u judge monday questioned...
2,san francisco reuters u judge monday appeared ...


In [9]:
X = df_train['Final_text'].values
y = df_train['class'].values

 ### Converting Text Data to Numerical Representation

In [10]:
vectorizer = TfidfVectorizer(max_features=9000)
vectorizer.fit(X)

X = vectorizer.transform(X)
X.shape

(40414, 9000)

In [11]:
model=RandomForestClassifier(n_estimators=100, random_state=42)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'n_estimators': [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]}]

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)

In [14]:
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(clf.best_params_)
print(clf.best_score_)

In [14]:
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
accuracy_score(y_test, y_pred)

0.9975256711616974

In [17]:
X_test = df_test['Final_text'].values

In [18]:
X_test = vectorizer.transform(X_test)

In [19]:
y_pred = model.predict(X_test)

In [20]:
df_test['class'] = y_pred

In [26]:
df_test.head()

Unnamed: 0,Final_text,class
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0


In [28]:
df_test.to_csv('submission.csv', index=False)

In [25]:
df_test['Final_text'] = df_test.index+1