### Loading Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from string import digits 
import string
from textblob import TextBlob


### Loading Datasets

In [124]:
test = pd.read_csv("../data/test_ZUT1mqB.csv")
train = pd.read_csv("../data/train_3.csv")

In [125]:
train.head()

Unnamed: 0,ID,Title,Domain
0,1,"What is good in a decision tree, a large or a ...",Techniques
1,2,Training data only contains single positive label,Techniques
2,3,Calculating percentage contribution of a negat...,Techniques
3,4,Unable to open solution checker!,Hackathons
4,5,User Name Change,Misc


#### Checking Counts

Dataset contains null values. 

In [126]:
train.count()

ID        3845
Title     3834
Domain    3845
dtype: int64

In [127]:
test.count()

ID       1649
Title    1649
dtype: int64

#### Checking null value counts

In [128]:
train.isna().sum() ##missing fields in gender, married, self employed (var) and dependents, loanamount, loan amount term, credit history

ID         0
Title     11
Domain     0
dtype: int64

In [129]:
test.isna().sum()

ID       0
Title    0
dtype: int64

**Checking proportion of null values** 

Self Employed and Credit History have more than 5% data missing. Imputing a mean or mode value can create bias in the value. We can either create a new category of unknown or impute based on subgroup

In [130]:
(100*train.isna().sum())/train.count() ##less than 0.3% missing fields in title, can drop or can impute mode of domain or doman it self

ID        0.000000
Title     0.286907
Domain    0.000000
dtype: float64

In [131]:
(100*test.isna().sum())/test.count() #no missing field

ID       0.0
Title    0.0
dtype: float64

In [132]:
train.Domain.value_counts()

Techniques    1852
Tools          917
Career         438
Hackathons     264
Resources      173
Other          125
Misc            76
Name: Domain, dtype: int64

In [133]:
##checking what domains have missing title

train[train.Title.isna()].Domain.value_counts()

Resources     3
Other         3
Misc          2
Hackathons    2
Career        1
Name: Domain, dtype: int64

In [134]:
train.dropna(axis=0,inplace = True)

In [135]:
train.isna().sum()

ID        0
Title     0
Domain    0
dtype: int64

In [136]:
train.count()

ID        3834
Title     3834
Domain    3834
dtype: int64

### Extract Keywords and Features from Variables

In [137]:
## Saving the orignal text column before modifications
train['original_title'] = train['Title']
test['original_title'] = test['Title']

In [138]:
text_cols = test.select_dtypes(exclude=np.number).columns

In [139]:
## Changing colums to lowercase
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].lower(),axis=1)
    test[col] = test.apply(lambda x: x[col].lower(),axis=1)

In [140]:
## removing punctuations
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].translate(str.maketrans('','',string.punctuation)),axis=1)
    test[col] = test.apply(lambda x: x[col].translate(str.maketrans('','',string.punctuation)),axis=1)    

In [141]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/apurvasij/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/apurvasij/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/apurvasij/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [142]:
## Word tokenization
train['text_words'] = train.apply(lambda x: word_tokenize(x['Title']),axis=1)
test['text_words'] = test.apply(lambda x: word_tokenize(x['Title']),axis=1)

In [143]:
## Create new features 
## Number of digits ( to see if citations are made)

def cnt_digits(sentence):
    return sum(c.isdigit() for c in sentence)
        
train['text_digit_cnt'] = train.apply(lambda x: cnt_digits(x.text_words),axis=1)
test['text_digit_cnt'] = test.apply(lambda x: cnt_digits(x.text_words),axis=1)

In [144]:
## removing digits
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].translate(str.maketrans('', '', digits)) ,axis=1)
    test[col] = test.apply(lambda x: x[col].translate(str.maketrans('', '', digits) ) ,axis=1)

In [145]:
## Word tokenization
train['text_words'] = train.apply(lambda x: word_tokenize(x['Title']),axis=1)
test['text_words'] = test.apply(lambda x: word_tokenize(x['Title']),axis=1)

In [146]:
## lemmatize verbs and nouns
## in final model only nouns have been lemmatize, verbs used to indicate polarity

def lemmat_words(words,pos):
    lemmated = [lemmatizer.lemmatize(word,pos) for word in words]
    return lemmated

lemmatizer = WordNetLemmatizer()

In [147]:
train['clean_text_lem'] = train.apply(lambda x: lemmat_words(x['text_words'],pos = wordnet.NOUN),axis=1)
test['clean_text_lem'] = test.apply(lambda x: lemmat_words(x['text_words'],pos = wordnet.NOUN),axis=1)

In [148]:
##remove stopwords
nltk.download('stopwords')

## stop words from nltk module
stop_words = list(set(stopwords.words('english')))

def rm_stopwords(text_words):
    stopped = [w for w in text_words if not w in stop_words]
    return stopped

train['clean_text_wrds'] = train.apply(lambda x: rm_stopwords(x['clean_text_lem']),axis=1)
test['clean_text_wrds'] = test.apply(lambda x: rm_stopwords(x['clean_text_lem']),axis=1)

train['clean_text'] = train.apply(lambda x: ' '.join(x['clean_text_wrds']),axis=1)
test['clean_text'] = test.apply(lambda x: ' '.join(x['clean_text_wrds']),axis=1)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/apurvasij/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [152]:
## Calculate Sentiment
def polarity(words):
    text = ' '.join([w for w in words])
    blob = TextBlob(text)
    return(blob.polarity)

train['title_polarity'] = train.apply(lambda x: polarity(x['text_words']),axis=1)
train['text_polarity'] = train.apply(lambda x: polarity(x['clean_text_wrds']),axis=1)
test['title_polarity'] = test.apply(lambda x: polarity(x['text_words']),axis=1)
test['text_polarity'] = test.apply(lambda x: polarity(x['clean_text_wrds']),axis=1)

In [153]:
## length of article
train['ttl_wrds'] = train.apply(lambda x: len(x.text_words),axis = 1)
test['ttl_wrds'] = test.apply(lambda x: len(x.text_words),axis = 1)

In [154]:
train.head()

Unnamed: 0,ID,Title,Domain,original_title,text_words,text_digit_cnt,clean_text_lem,clean_text_wrds,clean_text,title_polarity,text_polarity,ttl_wrds
0,1,what is good in a decision tree a large or a s...,Techniques,what is good in a decision tree a large or a s...,"[what, is, good, in, a, decision, tree, a, lar...",0,"[what, is, good, in, a, decision, tree, a, lar...","[good, decision, tree, large, small, leaf, size]",good decision tree large small leaf size,0.221429,0.221429,14
1,2,training data only contains single positive label,Techniques,training data only contains single positive label,"[training, data, only, contains, single, posit...",0,"[training, data, only, contains, single, posit...","[training, data, contains, single, positive, l...",training data contains single positive label,0.051948,0.077922,7
2,3,calculating percentage contribution of a negat...,Techniques,calculating percentage contribution of a negat...,"[calculating, percentage, contribution, of, a,...",0,"[calculating, percentage, contribution, of, a,...","[calculating, percentage, contribution, negati...",calculating percentage contribution negative c...,-0.3,-0.3,7
3,4,unable to open solution checker,Hackathons,unable to open solution checker,"[unable, to, open, solution, checker]",0,"[unable, to, open, solution, checker]","[unable, open, solution, checker]",unable open solution checker,-0.25,-0.25,5
4,5,user name change,Misc,user name change,"[user, name, change]",0,"[user, name, change]","[user, name, change]",user name change,0.0,0.0,3


In [155]:
train.text_digit_cnt.value_counts()

0    3642
1     162
2      28
4       1
6       1
Name: text_digit_cnt, dtype: int64

In [156]:
pd.pivot_table(train, index = "text_digit_cnt", columns = "Domain", values = "ID",aggfunc="count")

Domain,Career,Hackathons,Misc,Other,Resources,Techniques,Tools
text_digit_cnt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,411.0,230.0,64.0,116.0,151.0,1795.0,875.0
1,26.0,28.0,9.0,5.0,15.0,45.0,34.0
2,,4.0,,1.0,4.0,12.0,7.0
4,,,,,,,1.0
6,,,1.0,,,,


In [157]:
test.head()

Unnamed: 0,ID,Title,original_title,text_words,text_digit_cnt,clean_text_lem,clean_text_wrds,clean_text,title_polarity,text_polarity,ttl_wrds
0,3846,spark cheatsheet,spark cheatsheet,"[spark, cheatsheet]",0,"[spark, cheatsheet]","[spark, cheatsheet]",spark cheatsheet,0.0,0.0,2
1,3847,review of random forest code in python,review of random forest code in python,"[review, of, random, forest, code, in, python]",0,"[review, of, random, forest, code, in, python]","[review, random, forest, code, python]",review random forest code python,-0.5,-0.5,7
2,3848,chisq test for numeric variables,chisq test for numeric variables,"[chisq, test, for, numeric, variables]",0,"[chisq, test, for, numeric, variable]","[chisq, test, numeric, variable]",chisq test numeric variable,0.0,0.0,5
3,3849,prediction from loaded pickled file for single...,prediction from loaded pickled file for single...,"[prediction, from, loaded, pickled, file, for,...",0,"[prediction, from, loaded, pickled, file, for,...","[prediction, loaded, pickled, file, single, in...",prediction loaded pickled file single instance...,-0.071429,-0.071429,10
4,3850,even after installing anaconda on my pc i am u...,even after installing anaconda on my pc i am u...,"[even, after, installing, anaconda, on, my, pc...",0,"[even, after, installing, anaconda, on, my, pc...","[even, installing, anaconda, pc, unable, acces...",even installing anaconda pc unable access jupy...,-0.5,-0.5,18


In [158]:
pd.pivot_table(test, index = "text_digit_cnt", values = "ID",aggfunc="count")

Unnamed: 0_level_0,ID
text_digit_cnt,Unnamed: 1_level_1
0,1581
1,62
2,6


In [159]:
pd.pivot_table(train, index = "ttl_wrds", columns = "Domain", values = "ID",aggfunc="count")

Domain,Career,Hackathons,Misc,Other,Resources,Techniques,Tools
ttl_wrds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.0,1.0,,,,3.0,
2,13.0,6.0,5.0,6.0,2.0,73.0,6.0
3,20.0,30.0,6.0,6.0,14.0,111.0,30.0
4,31.0,27.0,4.0,12.0,28.0,194.0,53.0
5,47.0,41.0,13.0,22.0,22.0,196.0,61.0
6,50.0,44.0,5.0,9.0,21.0,181.0,90.0
7,59.0,34.0,10.0,14.0,16.0,183.0,101.0
8,55.0,20.0,4.0,13.0,9.0,206.0,97.0
9,35.0,15.0,7.0,6.0,15.0,158.0,91.0
10,28.0,18.0,8.0,11.0,10.0,141.0,99.0


In [3]:
train = pd.read_csv('../data/train_cleaned.csv')
test = pd.read_csv('../data/test_cleaned.csv')

In [5]:
train['char_count'] = train['clean_text'].apply(len)
train['word_density'] = train['char_count'] / (train['ttl_wrds']+1)
train['punctuation_count'] = train['original_title'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
train['title_word_count'] = train['original_title'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
train['upper_case_word_count'] = train['original_title'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

test['char_count'] = test['clean_text'].apply(len)
test['word_density'] = test['char_count'] / (test['ttl_wrds']+1)
test['punctuation_count'] = test['original_title'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
test['title_word_count'] = test['original_title'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
test['upper_case_word_count'] = test['original_title'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [7]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

train['noun_count'] = train['original_title'].apply(lambda x: check_pos_tag(x, 'noun'))
train['verb_count'] = train['original_title'].apply(lambda x: check_pos_tag(x, 'verb'))
train['adj_count'] = train['original_title'].apply(lambda x: check_pos_tag(x, 'adj'))
train['adv_count'] = train['original_title'].apply(lambda x: check_pos_tag(x, 'adv'))
train['pron_count'] = train['original_title'].apply(lambda x: check_pos_tag(x, 'pron'))

test['noun_count'] = test['original_title'].apply(lambda x: check_pos_tag(x, 'noun'))
test['verb_count'] = test['original_title'].apply(lambda x: check_pos_tag(x, 'verb'))
test['adj_count'] = test['original_title'].apply(lambda x: check_pos_tag(x, 'adj'))
test['adv_count'] = test['original_title'].apply(lambda x: check_pos_tag(x, 'adv'))
test['pron_count'] = test['original_title'].apply(lambda x: check_pos_tag(x, 'pron'))

In [8]:
##Saving data
train.to_csv('../data/train_cleaned.csv',index= False)
test.to_csv('../data/test_cleaned.csv',index= False)