In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import nltk
nltk.download('popular')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud
from bs4 import BeautifulSoup

%matplotlib inline
random.seed(42)

In [None]:
# Load the two raw tables from the working directory: one with the questions
# and one with their tags. They are joined on the shared `Id` column further down.
questions_data = pd.read_csv('Questions^.csv')
tags_data = pd.read_csv('Tags^.csv')

In [None]:
questions_data.head()

In [None]:
tags_data.head()

In [None]:
questions_data.info()

In [None]:
tags_data.info()

In [None]:
# Rows of the tags table that contain at least one missing value.
rows_with_missing = tags_data.isna().any(axis=1)
tags_residual = tags_data.loc[rows_with_missing]
tags_residual.head()

In [None]:
# Inner join on `Id`: only ids present in both tables survive, and a question
# appears once for every matching row in the tags table.
question_tags_data = questions_data.merge(tags_data, on='Id', how='inner')

In [None]:
question_tags_data.head()

In [None]:
question_tags_data.info()

### Dropping unused columns and normalising column names

In [None]:
# Keep only the columns used downstream and normalise the remaining headers
# (trimmed, spaces -> underscores, lower-case) so they can be used as
# attributes, e.g. `question_tags_data.tag`.
# The original drop list named 'OwnerUserId' twice; each label is listed once here.
# NOTE(review): if the duplicate was a typo for another column, add it to the list.
question_tags_data = (
    question_tags_data
    .drop(columns=['OwnerUserId', 'Id', 'CreationDate', 'Score'])
    .rename(columns=lambda name: name.strip().replace(' ', '_').lower())
)

In [None]:
question_tags_data.head()

In [None]:
question_tags_data.info()

In [None]:
question_tags_data[question_tags_data.isna().any(axis=1)].head()

### Dropping NA and text cleaning

In [None]:
# Discard every row that has a missing value in any column, then renumber the
# rows 0..n-1 so positional and label-based access agree again.
question_tags_data = question_tags_data.dropna(axis=0).reset_index(drop=True)

In [None]:
question_tags_data.info()

In [None]:
# Bar chart of the 20 most frequent tags. The counts are computed once and
# reused for both axes instead of running value_counts() twice.
top_tag_counts = question_tags_data.tag.value_counts().nlargest(20)

plt.figure(figsize=(20,3))
count_plot = sns.barplot(x=top_tag_counts.index, y=top_tag_counts.to_numpy())
count_plot.set(xlabel='tag', ylabel='number of questions', title='20 most frequent tags')
# Rotate the labels through tick_params: calling set_xticklabels() on
# automatically located ticks raises a warning in recent Matplotlib releases.
count_plot.tick_params(axis='x', labelrotation=45)

In [None]:
# Keep at most the first 500 rows of every tag (rows stay in their original
# order); tags with fewer rows keep all of them. Presumably this caps the
# influence of very frequent tags and keeps the data set small - confirm.
question_tags_data_grouped = question_tags_data.groupby('tag').head(500).reset_index(drop=True)

In [None]:
# Remove every tag that is left with fewer than 100 rows, then renumber the rows.
value_counts = question_tags_data_grouped['tag'].value_counts()
to_remove = value_counts.loc[value_counts < 100].index
is_kept = ~question_tags_data_grouped['tag'].isin(to_remove)
question_tags_data_grouped = question_tags_data_grouped.loc[is_kept].reset_index(drop=True)

In [None]:
# Size of the reduced frame and the number of distinct tags that remain
# (the tag column has no missing values at this point, so nunique() == len(unique())).
print(question_tags_data_grouped.shape)
print(question_tags_data_grouped['tag'].nunique())
question_tags_data_grouped.head()

In [None]:
# Split every question body into prose and code.
#  * The parser is named explicitly so the result does not depend on which
#    parsers happen to be installed (and matches the later get_text() step).
#  * Each <code> element is detached from the tree with extract(); what is left
#    of the tree is the prose.
#  * Both parts are stored as plain strings. The code snippets are joined with a
#    single space rather than via str(ResultSet), which used to leak the list
#    brackets and commas ("[<code>..</code>, <code>..</code>]") into the column
#    and produced "[]" for bodies without any code.
# NOTE(review): the cell overwrites `body`, so running it a second time on the
# already-processed frame would yield an empty `code` column.
body = []
codes = []
for raw_html in question_tags_data_grouped['body']:
    soup = BeautifulSoup(raw_html, 'html.parser')
    code_elements = [element.extract() for element in soup.find_all('code')]
    body.append(str(soup))
    codes.append(' '.join(str(element) for element in code_elements))
question_tags_data_grouped['body'] = body
question_tags_data_grouped['code'] = codes

In [None]:
def _strip_html(value):
    """Return the visible text of ``value`` (converted to str) with all markup removed."""
    return BeautifulSoup(str(value), 'html.parser').get_text()

# Apply the stripper to every cell of every column. A per-column Series.map is
# used because DataFrame.applymap is deprecated since pandas 2.1, while this
# form behaves the same on old and new pandas versions.
question_tags_data_grouped = question_tags_data_grouped.apply(lambda column: column.map(_strip_html))

In [None]:
question_tags_data_grouped.head()

In [None]:
def clean_text(text):
    """Lower-case ``text``, expand common English contractions and tidy whitespace.

    Parameters
    ----------
    text : str
        Raw text (title, body or code snippet).

    Returns
    -------
    str
        The lower-cased text with contractions such as "what's", "can't",
        "'ve" or "'ll" expanded, every run of whitespace collapsed to a single
        space and leading/trailing spaces removed.
    """
    # Applied strictly in this order; the order matters.
    substitutions = (
        (r"what's", "what is "),
        # Must precede the generic "'s" rule, otherwise "'scuse" is turned
        # into " cuse" before this rule ever sees it.
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        # Must precede "n't", otherwise "can't" would become "ca not".
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # An apostrophe directly followed by a newline / non-breaking space.
        (r"\'\n", " "),
        (r"\'\xa0", " "),
        # Raw string: "\s" in a normal string literal is an invalid escape.
        (r"\s+", " "),
    )
    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip(' ')

In [None]:
# Normalise the three text columns with clean_text, one column at a time.
for column in ('title', 'body', 'code'):
    question_tags_data_grouped[column] = question_tags_data_grouped[column].apply(clean_text)

In [None]:
# English stop words as a set for O(1) membership tests.
stop = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop`` removed."""
    return ' '.join(word for word in text.split() if word not in stop)


# The filtered text goes into new `<column>_without_stopwords` columns; the
# originals are left untouched for now.
for column in ('title', 'body', 'code'):
    question_tags_data_grouped[column + '_without_stopwords'] = (
        question_tags_data_grouped[column].apply(_drop_stopwords)
    )

In [None]:
# Lemmatizing: one WordNetLemmatizer instance, created once and used by
# lemmatize_sentences for every token.
lemmatizer = WordNetLemmatizer()
def lemmatize_sentences(sentence):
    """Lemmatize each whitespace-separated token of ``sentence``.

    The tokens are passed one by one to the module-level ``lemmatizer`` and the
    results are joined back together with single spaces. An empty or
    whitespace-only sentence yields an empty string.
    """
    return ' '.join(lemmatizer.lemmatize(word) for word in sentence.split())

In [None]:
# Lemmatize the stop-word-free text columns in place, one column at a time.
for column in ('title_without_stopwords', 'body_without_stopwords', 'code_without_stopwords'):
    question_tags_data_grouped[column] = question_tags_data_grouped[column].apply(lemmatize_sentences)

In [None]:
# Spot check: print the cleaned body, title and code of the row labelled 0.
for column in ('body_without_stopwords', 'title_without_stopwords', 'code_without_stopwords'):
    print(question_tags_data_grouped.loc[0, column])

In [None]:
# Replace each text column with its cleaned counterpart and remove the
# temporary `*_without_stopwords` column. pop() removes the temporary column
# and returns it; assigning to the existing name keeps the column position.
for column in ('title', 'body', 'code'):
    question_tags_data_grouped[column] = question_tags_data_grouped.pop(column + '_without_stopwords')

# Persist the cleaned frame (the row index is written as the first, unnamed column).
question_tags_data_grouped.to_csv('modified_question_and_body.csv', sep=',')

In [None]:
# Keep only the first row for every distinct cleaned title and report the frame
# shape before and after. The former `groupby('title').head(10)` line was
# removed: its result was neither assigned nor displayed, so it had no effect.
question_tags_data_grouped_2 = question_tags_data_grouped.drop_duplicates(subset='title', keep='first')
print(question_tags_data_grouped.shape, question_tags_data_grouped_2.shape)

In [None]:
# NOTE(review): this cell is an exact repeat of the previous cell; it recomputes
# the same de-duplicated frame and prints the same shapes. Consider deleting it.
question_tags_data_grouped_2 = question_tags_data_grouped.drop_duplicates(subset='title', keep='first')
# NOTE(review): the result of the next line is neither assigned nor displayed.
question_tags_data_grouped_2.groupby('title').head(10)
print(question_tags_data_grouped.shape, question_tags_data_grouped_2.shape)
