In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
# English stop words used by the TF-IDF vectorizers below.
# On an environment where the NLTK 'stopwords' corpus has never been downloaded
# the lookup raises LookupError, so fetch the corpus once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk  # local import: the file-level `import nltk` runs only after this line
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import re
import nltk
import seaborn as sns
%matplotlib inline

In [None]:
# Read the two training CSVs: the text records and their label rows.
train_data, train_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'F:\DataSets\Akaike Technologies\train_data.csv',
        r'F:\DataSets\Akaike Technologies\train_label.csv',
    )
)

In [None]:
# Read the data that the final predictions are made for.
df_test = pd.read_csv(
    r'F:\DataSets\Akaike Technologies\test_data.csv',
)

In [None]:
# Number of missing values per column in each loaded frame.
# A notebook cell only renders its last expression, so every summary is
# printed explicitly; otherwise the train_data / train_label counts are
# computed and thrown away.
for frame_name, frame in (
    ('train_data', train_data),
    ('train_label', train_label),
    ('df_test', df_test),
):
    print('Missing values in {}:'.format(frame_name))
    print(frame.isnull().sum())
    print()

# Label data categorization: creating dummy features and grouping by 'id'

In [None]:
# Expand the label table into indicator (dummy) columns.
# NOTE(review): assumes 'id' is numeric so get_dummies leaves it as a single column — confirm.
train_label = pd.get_dummies(data=train_label)

In [None]:
# Collapse to one row per 'id' by summing the indicator columns.
train_label = train_label.groupby(['id'], as_index=False).sum()

In [None]:
# Every column other than 'id' is a label category to model.
categories = train_label.drop(columns=['id']).columns.tolist()

# Merge two dataframes: train_data & train_label

In [None]:
# df_train = final train dataframe: train_data joined with its label columns on 'id'.
df_train = train_data.merge(train_label, on='id')

# Cleaning of text data

In [None]:
# Ordered (pattern, replacement) rules applied to the lower-cased text.
# Order matters: the specific forms ("what's", "'scuse", "can't") must run
# before the generic "'s" / "n't" rules, which would otherwise consume them.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a piece of free text for vectorisation.

    The text is lower-cased, common English contractions are expanded, every
    non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space and surrounding spaces are removed.

    Parameters
    ----------
    text : str or None or float('nan')
        The raw text. ``None`` and NaN (how pandas represents a missing
        cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for empty or missing input.
    """
    # Missing cells arrive as None or a float NaN (NaN is the only float that
    # is not equal to itself); calling .lower() on them would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # \W already turns all punctuation into spaces, so no further
    # per-character punctuation stripping is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [None]:
# Run the same cleaning routine over the 'text' column of both frames.
df_train['text'] = df_train['text'].map(clean_text)
df_test['text'] = df_test['text'].map(clean_text)

# Split into train & test data.

df_train is split into train and test data, creating the variables 'X_train' & 'X_test'.

In [None]:
# Hold out 20% of df_train for evaluation; the fixed seed makes the shuffle repeatable.
train, test = train_test_split(
    df_train,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [None]:
# Model inputs: the 'text' column of each split.
X_train, X_test = train['text'], test['text']

# Naive Bayes

In [None]:
# TF-IDF features feeding a one-vs-rest multinomial Naive Bayes classifier.
NB_pipeline = Pipeline([
                # scikit-learn validates `stop_words` as 'english', a list or None,
                # so the stop-word set is passed as a (sorted, hence stable) list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

In [None]:
# Refit the Naive Bayes pipeline once per label category and report its hold-out accuracy.
for category in categories:
    print('... Processing {}'.format(category))
    # fit on the training texts against this category's label column
    NB_pipeline.fit(X_train, train[category])
    # score the predictions for the held-out texts
    prediction = NB_pipeline.predict(X_test)
    accuracy = accuracy_score(test[category], prediction)
    print('Test accuracy is {}'.format(accuracy))

# Logistic regression:

In [None]:
# One logistic-regression fit per label category: report the hold-out accuracy
# and keep the predictions for df_test so they can be written to the submission.
test_result = []    # one array of df_test predictions per category, in `categories` order
LogReg_pipeline = Pipeline([
                # scikit-learn validates `stop_words` as 'english', a list or None,
                # so the stop-word set is passed as a (sorted, hence stable) list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # fit on the training texts against this category's label column
    LogReg_pipeline.fit(X_train, train[category])
    # predictions for the held-out split, used only for the accuracy below
    prediction = LogReg_pipeline.predict(X_test)

    # predictions for the submission data
    test_prediction = LogReg_pipeline.predict(df_test.text)
    test_result.append(test_prediction)

    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

# My submission:

In [None]:
# One column per category, filled with that category's df_test predictions.
df_result = pd.DataFrame(dict(zip(categories, test_result)))

In [None]:
# Use the df_test ids as the row index of the prediction table.
df_result = df_result.set_index(df_test['id'])

In [None]:
# Write the submission file. The path must be a raw string: in a normal
# literal the `\U` of this Windows path starts a unicode escape and the cell
# fails with a SyntaxError before anything runs.
df_result.to_csv(r'C:\Users\Mangesh\Desktop\submission.csv', header=True)

# Data Visualization

In [None]:
# Histogram of text lengths in characters; bin edges run from 0 up to
# (not including) 6000 in steps of 5.
lens = df_train['text'].str.len()
lens.hist(bins=np.arange(0, 6000, 5))

In [None]:
# A few df_train['text'] entries are longer than 5000 characters; they can be
# considered 'outliers'. Show those rows.
df_train.loc[df_train['text'].str.len() > 5000]