**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


**Importing Dataset**

In [2]:
# Read the labelled training pairs and the unlabelled test pairs from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

**Data Preprocessing - Train.csv**

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,195611,0,1,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,191474,2,3,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,25300,2,4,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,123757,2,8,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated
4,141761,2,11,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outpaces Hong Kong? Defending R...,unrelated


In [4]:
# List the column labels of the training frame.
train.columns

Index(['id', 'tid1', 'tid2', 'title1_en', 'title2_en', 'label'], dtype='object')

In [5]:
# Count the distinct values in each column (the label column has 3 classes).
train.nunique()

id           256442
tid1          63783
tid2         124207
title1_en     62697
title2_en    122492
label             3
dtype: int64

In [6]:
import nltk

# Fetch the NLTK stop-word corpus needed by the stemming step.
# quiet=True stops the downloader from printing its progress log, which
# otherwise writes the absolute local nltk_data path into the saved notebook.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshaysinghim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Count missing values per column in the test set.
test.isnull().sum()

id           0
tid1         0
tid2         0
title1_en    0
title2_en    0
dtype: int64

In [8]:
# Count missing values per column in the training set.
train.isnull().sum()

id           0
tid1         0
tid2         0
title1_en    0
title2_en    0
label        0
dtype: int64

**Stemming Process**

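In this step we clean each title: non-alphabetic characters are removed, the text is lower-cased, common English stop words (for example *is*, *am*, *are*, *then*, *they*) are dropped, and every remaining word is reduced to its stem with the Porter stemmer. To see the complete list of stop words,
uncomment and run the `print(stopwords.words('english'))` command in the next cell.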


In [9]:
#print(stopwords.words('english'))

In [10]:
# Single Porter stemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

In [11]:
# Build the stop-word lookup once. stopwords.words('english') returns a freshly
# built list on every call, so calling it per token (and scanning that list
# linearly) dominates the run time on a few hundred thousand titles.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a title into a space-separated string of stemmed tokens.

    Non-alphabetic characters are replaced by spaces, the text is lower-cased
    and split on whitespace, English stop words are dropped, and every
    remaining token is reduced to its Porter stem.

    Parameters
    ----------
    content : str
        Raw title text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in _STOP_WORDS]
    return ' '.join(stems)

In [None]:
# Replace title1_en with its stemmed form (overwrites the raw text in place,
# so re-running this cell stems already-stemmed titles again).
train['title1_en'] = train['title1_en'].apply(stemming)

In [None]:
# Replace title2_en with its stemmed form (overwrites the raw text in place,
# so re-running this cell stems already-stemmed titles again).
train['title2_en'] = train['title2_en'].apply(stemming)

In [None]:
# Spot-check the first two rows after stemming.
train.head(2)

**Separating Target Variable**

In [None]:
# The last column holds the target; every column before it is a feature.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [None]:
# Spot-check the feature frame (all columns except the last).
X_train.head(2)

In [None]:
# Spot-check the target series (last column of train).
y_train.head(2)

In [None]:
# Convert the stemmed titles into TF-IDF vectors.
# One vocabulary is learned from both title columns stacked together, so that
# title1 and title2 are projected into the same feature space.
# NOTE(review): the vectorizer is fitted on every training row before the
# hold-out split is made, so hold-out titles contribute to the vocabulary/IDF.
vectorizer = TfidfVectorizer().fit(
    pd.concat([X_train['title1_en'], X_train['title2_en']])
)

title1_tfidf, title2_tfidf = (
    vectorizer.transform(X_train[column]) for column in ('title1_en', 'title2_en')
)



In [None]:
# Concatenate the two TF-IDF blocks column-wise: title1 features first, then title2.
combined_sparse_matrix = hstack((title1_tfidf, title2_tfidf))

**Splitting Data into Test and Training**

In [None]:
# Hold out 20% of the vectorised training pairs for evaluation.
# NOTE: X_train / y_train are rebound here, from the full frame and label
# series to the 80% training portion of the sparse matrix and labels.
X, y = combined_sparse_matrix, y_train
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)


**Multinomial Logistic Regression**

In [None]:
# Multinomial (softmax) logistic regression over the three labels.
# The `multi_class` argument is deprecated in recent scikit-learn releases;
# with the lbfgs solver and more than two classes the default already fits a
# multinomial model, so it is omitted to avoid the deprecation warning and the
# eventual TypeError once the parameter is removed.
model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=10)

In [None]:
# Train the logistic-regression model on the 80% training split.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split (X_test is carved out of train.csv; it is not test.csv).
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the held-out split.
print(classification_report(y_test, y_pred))

***Accuracy 0.80***

**Random Forest Classifier** 

In [None]:
#This will take some time to run completely, be patient. 

In [None]:
# Random forest with 50 trees and a fixed seed.
# n_jobs=-1 builds the trees on all available CPU cores; for a fixed
# random_state the fitted forest is the same, only the wall-clock time drops.
rf_model = RandomForestClassifier(n_estimators=50, random_state=10, n_jobs=-1)
rf_model.fit(X_train, y_train)


In [None]:
# Score the random forest on the held-out split and print the per-class report.
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))


***Accuracy 0.86***

**Decision Tree Model**

In [None]:
# Single decision tree with a fixed seed, trained on the same 80% split for comparison.
dt_model = DecisionTreeClassifier(random_state=10)
dt_model.fit(X_train, y_train)


In [None]:
# Score the decision tree on the held-out split and print the per-class report.
y_pred_dt = dt_model.predict(X_test)
print(classification_report(y_test, y_pred_dt))


***Accuracy 0.80***

Since Random Forest produced the highest accuracy, we will use that model to classify labels on our test.csv dataset.

**Data Preprocessing - Test.csv**

In [None]:
# Preview the first rows of the unlabelled test frame.
test.head()

**Stemming**

In [None]:
# Apply the same stemming used for the training titles to both test title columns.
for title_column in ('title1_en', 'title2_en'):
    test[title_column] = test[title_column].apply(stemming)

In [None]:
# Project the test titles into the TF-IDF space learned from the training
# titles (transform only, so the vocabulary is not refitted on test.csv).
title1_test, title2_test = (
    vectorizer.transform(test[column]) for column in ('title1_en', 'title2_en')
)

In [None]:
# Same column layout as the training matrix: title1 features first, then title2.
combined_test_matrix = hstack((title1_test, title2_test))

Now we run the test data through the Random Forest model.

**Making Predictions on the Test Dataset**

In [None]:
# Label every test.csv pair with the trained random forest.
y_test_pred_rf = rf_model.predict(combined_test_matrix)


**Saving the result**

In [None]:
# Build the submission frame: the test ids plus the random-forest predictions
# (the frame is named submission_dt, but the labels come from rf_model).
submission_dt = test[['id']].copy()
submission_dt['label'] = y_test_pred_rf
# Write without the index column so the file contains only id and label.
submission_dt.to_csv('submission.csv', index=False)