In [1]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
%conda install -c conda-forge wordcloud

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import warnings
from wordcloud import WordCloud
warnings.filterwarnings("ignore")

In [4]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Reading the data

In [5]:
# read the data
data_train = pd.read_csv('../datasets/train.csv')
data_train.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [6]:
data_valid = pd.read_csv("../datasets/valid.csv")
data_valid.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [7]:
train_data = data_train.copy()
valid_data = data_valid.copy()

In [8]:
nltk.download('stopwords')
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
porter = PorterStemmer()

In [27]:
# Preprocess
def preprocess(text, lower=True, stem=False,
                filters="[!\"'#$%&()*\+,-.:;<=>?@\\\[\]^_`{|}~]",
                stopwords=STOPWORDS):
    # lower the text
    if lower:
        text = text.lower()
    
    # remove the stopwords
    pattern = re.compile(r'\b(' + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub('', text)
    
    # remove <p> and </p> tags
    text = re.sub(r"[^(a-zA-Z0-9)\s]", " ", text)
    text = re.sub(r"\bp\b","", text)
    
    # spacing and filters
    # text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    # text = re.sub(filters, r"", text)
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()
    
    # Remove links
    text = re.sub(r"http\S+", "", text)

    # Stemming
    if stem:
        text = " ".join([porter.stem(word) for word in text.split(" ")])

    return text

In [28]:
import ipywidgets as widgets

In [29]:
train_data['Body'][0]

'<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'

In [30]:
# Toggle preprocessing parameters
@widgets.interact(lower=True, stem=False)
def display_preprocessed_text(lower, stem):
    text = train_data['Body'][0]
    preprocessed_text = preprocess(text=text, lower=lower, stem=stem)
    print(text)
    print (preprocessed_text)

interactive(children=(Checkbox(value=True, description='lower'), Checkbox(value=False, description='stem'), Ou…

In [32]:
# check for null values
percent_missing = train_data.isnull().sum() * 100 / len(train_data)
missing_value_df = pd.DataFrame({'column_name': train_data.columns,
                                 'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
Id,Id,0.0
Title,Title,0.0
Body,Body,0.0
Tags,Tags,0.0
CreationDate,CreationDate,0.0
Y,Y,0.0


In [38]:
train_df = train_data[['Title','Body', 'Y']]
valid_df = valid_data[['Title','Body', 'Y']]

In [39]:
train_df['text'] = train_df['Title'] + train_df['Body']
valid_df['text'] = valid_df['Title'] + valid_df['Body']
train_df.drop(['Title', 'Body'], axis=1,inplace=True)
valid_df.drop(['Title', 'Body'], axis=1,inplace=True)

In [40]:
train_df.head()

Unnamed: 0,Y,text
0,LQ_CLOSE,Java: Repeat Task Every Random Seconds<p>I'm a...
1,HQ,Why are Java Optionals immutable?<p>I'd like t...
2,HQ,Text Overlay Image with Darkened Opacity React...
3,HQ,Why ternary operator in swift is so picky?<p>T...
4,HQ,hide/show fab with scale animation<p>I'm using...


In [41]:
train_df.text = train_df.text.apply(preprocess,lower=True, stem=False)
valid_df.text = valid_df.text.apply(preprocess,lower=True, stem=False)

In [42]:
train_df.head()

Unnamed: 0,Y,text
0,LQ_CLOSE,java repeat task every random seconds already ...
1,HQ,java optionals immutable like understand java ...
2,HQ,text overlay image darkened opacity react nati...
3,HQ,ternary operator swift picky question simple c...
4,HQ,hide show fab scale animation using custom flo...


In [43]:
# shape of data
train_df.shape

(45000, 2)

In [44]:
valid_data.shape

(15000, 6)

## EDA

### What is the distribution of each question rating?

In [45]:
from collections import Counter

In [None]:
ratings, ratings_count = zip(*Counter(train_df['text'].values).most_common(1000))
plt.figure(figsize=(19,8))
ax = sns.barplot(list(ratings), list(ratings_count))
plt.title("Ratings Distribution", fontsize=20)
plt.xlabel("Ratings", fontsize=16)
plt.ylabel("Count", fontsize=16)
ax.set_xticklabels(ratings, rotation=90, fontsize=14)
plt.show()

### Wordcloud

In [None]:
@widgets.interact(ratings=list(ratings))
def display_word_cloud(rating="HQ"):
    plt.figure(figsize=(15, 8))
    subset = train_df[train_df.Y==rating]
    text = subset.text.values
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color="white",
        collocations=False,
        width=500, height=100,
    ).generate(" ".join(text))
    plt.axis("off")
    plt.imshow(cloud)