#  <span style="color:#3366ff"> <strong> Tags Proposal</strong>

#  <span style="color:#3366ff"> <strong> 20180911 (Part 1 Cleaning)

<span style="color:#3366ff"> This notebook is dedicated to the StackExchange Database cleaning:   
<span style="color:#3366ff"> - 5000 lines sampling;  
<span style="color:#3366ff"> - Beautifulsoup : html code removal;    
<span style="color:#3366ff"> - stopwords removal (english most common words);  
<span style="color:#3366ff"> - Lemmatization;         
<span style="color:#3366ff"> - Duplicate words removal phase;      
<span style="color:#3366ff"> - Nan removal.     

### Librairies Import

In [1]:
import pandas as pd

from bs4 import BeautifulSoup 

import nltk
from nltk.corpus import stopwords # Import the stop word list
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

import string

### Database loading

In [2]:
data_text_raw1 = pd.read_csv(
    'DATA/QueryResults213000.csv', 
    #sep='\t', 
    encoding='utf-8', 
    low_memory = False)

data_text_raw1.head()

Unnamed: 0,Id,Body,Title,Tags
0,4,<p>I want to use a track-bar to change a form'...,Convert Decimal to Double?,<c#><floating-point><type-conversion><double><...
1,6,<p>I have an absolutely positioned <code>div</...,Percentage width child element in absolutely p...,<html><css><css3><internet-explorer-7>
2,9,<p>Given a <code>DateTime</code> representing ...,How do I calculate someone's age in C#?,<c#><.net><datetime>
3,11,<p>Given a specific <code>DateTime</code> valu...,Calculate relative time in C#,<c#><datetime><time><datediff><relative-time-s...
4,13,<p>Is there any standard way for a Web Server ...,Determine a User's Timezone,<javascript><html><browser><timezone><timezone...


In [3]:
# Concat Body + Title = Text
data_text_raw1['Text'] = data_text_raw1['Body'] + data_text_raw1['Title']

data_text_raw = data_text_raw1.filter(items=(['Text', 'Tags']))

shaperaw = data_text_raw.shape

print("This database contains",shaperaw[0], 
      "questions/titles  + tags.")

This database contains 26683 questions/titles  + tags.


In [4]:
pd.options.display.max_columns=999
data_text_raw.head(10)

Unnamed: 0,Text,Tags
0,<p>I want to use a track-bar to change a form'...,<c#><floating-point><type-conversion><double><...
1,<p>I have an absolutely positioned <code>div</...,<html><css><css3><internet-explorer-7>
2,<p>Given a <code>DateTime</code> representing ...,<c#><.net><datetime>
3,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...
4,<p>Is there any standard way for a Web Server ...,<javascript><html><browser><timezone><timezone...
5,"<p>What is the difference between <a href=""htt...",<.net><math>
6,<p>How do you expose a LINQ query as an ASMX w...,<c#><linq><web-services><.net-3.5>
7,"<p>How do I store binary data in <a href=""http...",<mysql><database><binary-data><data-storage>
8,<p>I'm looking for the fastest way to obtain t...,<performance><algorithm><language-agnostic><un...
9,<p>If I have a <code>trigger</code> <code>befo...,<mysql><database><triggers>


# 10000 lines Sampled database for Test

In [5]:
data_test = data_text_raw.head(10000)

In [6]:
shapetest = data_test.shape

print("This SAMPLED database contains",shapetest[0], 
      "questions/titles + tags to be cleaned.")

This SAMPLED database contains 10000 questions/titles + tags to be cleaned.


In [7]:
#data_test = data_text_raw

<span style="color:#3366ff"> This table contains a lot of html code and other symobols --> To be cleaned

# <span style="color:#6600cc"> Part 1: "Text" X CLEANING  - - - - - - - - - - - - - - - 

### Html code removing in "Text"

In [8]:
# BeautifulSoup Applied on all the table
data = [BeautifulSoup(x).get_text() for x in data_test["Text"]]
# "Text" LOWer
data = [x.lower() for x in data]
# "Text" Punctuation removal
data = [x.translate(string.punctuation) for x in data]
# Last Symbols removal
data = [re.sub('[^a-zA-Z]',' ',x) for x in data]

Text_tmp = pd.DataFrame(data, columns = ['Text_tmp_soup'])
Text_tmp.head()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Unnamed: 0,Text_tmp_soup
0,i want to use a track bar to change a form s o...
1,i have an absolutely positioned div containing...
2,given a datetime representing a person s birth...
3,given a specific datetime value how do i disp...
4,is there any standard way for a web server to ...


### Stopwords removal

In [9]:
# english words over used
stops = set(stopwords.words('english'))

In [10]:
def tok(x):
    kkk = word_tokenize(x)
    ttt = [w for w in kkk if w not in stops]
    return ttt

In [11]:
data = [" ".join(tok(x)) for x in Text_tmp['Text_tmp_soup']]
Text_tmp = pd.DataFrame(data, columns = ['Text_tmp_nan_double'])
Text_tmp.head(10)

Unnamed: 0,Text_tmp_nan_double
0,want use track bar change form opacity code de...
1,absolutely positioned div containing several c...
2,given datetime representing person birthday ca...
3,given specific datetime value display relative...
4,standard way web server able determine user ti...
5,difference math floor math truncate net differ...
6,expose linq query asmx web service usually bus...
7,store binary data mysql binary data mysql
8,looking fastest way obtain value personal chal...
9,trigger update table throw error prevents upda...


In [12]:
data_text_tmp = pd.concat([Text_tmp['Text_tmp_nan_double'], data_test["Tags"]], axis = 1)
data_text_tmp.head(10)

Unnamed: 0,Text_tmp_nan_double,Tags
0,want use track bar change form opacity code de...,<c#><floating-point><type-conversion><double><...
1,absolutely positioned div containing several c...,<html><css><css3><internet-explorer-7>
2,given datetime representing person birthday ca...,<c#><.net><datetime>
3,given specific datetime value display relative...,<c#><datetime><time><datediff><relative-time-s...
4,standard way web server able determine user ti...,<javascript><html><browser><timezone><timezone...
5,difference math floor math truncate net differ...,<.net><math>
6,expose linq query asmx web service usually bus...,<c#><linq><web-services><.net-3.5>
7,store binary data mysql binary data mysql,<mysql><database><binary-data><data-storage>
8,looking fastest way obtain value personal chal...,<performance><algorithm><language-agnostic><un...
9,trigger update table throw error prevents upda...,<mysql><database><triggers>


### "Text_OK" Dropna

In [13]:
data_text_drop = data_text_tmp.loc[data_text_tmp.Text_tmp_nan_double.str.len()>0]

data_text_drop = pd.DataFrame(data_text_drop , columns = ['Text_tmp_nan_double', 'Tags']).reset_index()
data_text_drop = data_text_drop.filter(items=(['Text_tmp_nan_double', 'Tags']))
data_text_drop.head(15)

Unnamed: 0,Text_tmp_nan_double,Tags
0,want use track bar change form opacity code de...,<c#><floating-point><type-conversion><double><...
1,absolutely positioned div containing several c...,<html><css><css3><internet-explorer-7>
2,given datetime representing person birthday ca...,<c#><.net><datetime>
3,given specific datetime value display relative...,<c#><datetime><time><datediff><relative-time-s...
4,standard way web server able determine user ti...,<javascript><html><browser><timezone><timezone...
5,difference math floor math truncate net differ...,<.net><math>
6,expose linq query asmx web service usually bus...,<c#><linq><web-services><.net-3.5>
7,store binary data mysql binary data mysql,<mysql><database><binary-data><data-storage>
8,looking fastest way obtain value personal chal...,<performance><algorithm><language-agnostic><un...
9,trigger update table throw error prevents upda...,<mysql><database><triggers>


In [14]:
shapetext = data_text_drop.shape

print("This SAMPLED & CLEANED database contains",shapetext[0], 
      "questions/titles  + tags.")

This SAMPLED & CLEANED database contains 10000 questions/titles  + tags.


### "Text" Lemmatization

In [15]:
wnl = WordNetLemmatizer()

def lemm(x):
    lemlist = []
    [lemlist.append(wnl.lemmatize(x))]
    return lemlist

In [16]:
lemmi = [lemm(x) for x in data_text_drop["Text_tmp_nan_double"]]
lemmi = pd.DataFrame(lemmi, columns=['Text_tmp_double'])
lemmi.head(15)

Unnamed: 0,Text_tmp_double
0,want use track bar change form opacity code de...
1,absolutely positioned div containing several c...
2,given datetime representing person birthday ca...
3,given specific datetime value display relative...
4,standard way web server able determine user ti...
5,difference math floor math truncate net differ...
6,expose linq query asmx web service usually bus...
7,store binary data mysql binary data mysql
8,looking fastest way obtain value personal chal...
9,trigger update table throw error prevents upda...


In [17]:
data_text_lemm = pd.concat([lemmi['Text_tmp_double'], data_text_drop["Tags"]], axis = 1)
data_text_lemm.head(10)

Unnamed: 0,Text_tmp_double,Tags
0,want use track bar change form opacity code de...,<c#><floating-point><type-conversion><double><...
1,absolutely positioned div containing several c...,<html><css><css3><internet-explorer-7>
2,given datetime representing person birthday ca...,<c#><.net><datetime>
3,given specific datetime value display relative...,<c#><datetime><time><datediff><relative-time-s...
4,standard way web server able determine user ti...,<javascript><html><browser><timezone><timezone...
5,difference math floor math truncate net differ...,<.net><math>
6,expose linq query asmx web service usually bus...,<c#><linq><web-services><.net-3.5>
7,store binary data mysql binary data mysql,<mysql><database><binary-data><data-storage>
8,looking fastest way obtain value personal chal...,<performance><algorithm><language-agnostic><un...
9,trigger update table throw error prevents upda...,<mysql><database><triggers>


### "Text" Duplicate words removal

In [18]:
def unique(l):
    ulist = []
    [ulist.append(x) for x in l if x not in ulist]
    uniq = ' '.join(ulist)
    return uniq

In [19]:
unik = [unique(x.split()) for x in data_text_lemm["Text_tmp_double"]]

In [20]:
unik = pd.DataFrame(unik, columns= ['Text_OK'])
unik.head(10)

Unnamed: 0,Text_OK
0,want use track bar change form opacity code de...
1,absolutely positioned div containing several c...
2,given datetime representing person birthday ca...
3,given specific datetime value display relative...
4,standard way web server able determine user ti...
5,difference math floor truncate net
6,expose linq query asmx web service usually bus...
7,store binary data mysql
8,looking fastest way obtain value personal chal...
9,trigger update table throw error prevents mysql


# <span style="color:#6600cc"> Part 2 : "Tags" y Cleaning - - - - - - - - - - - - - - - - - - - - - - - - - 

### Html code removing in Tags

In [21]:
# symbol removing
data_tags = [x.replace('<', ' ').replace('>', ' ') for x in data_text_lemm['Tags']]
# "Tags" LOWer
data_tags = [x.lower() for x in data_tags]

Tags_tmp = pd.DataFrame(data_tags, columns = ['Tags_OK'])
Tags_tmp.head()

Unnamed: 0,Tags_OK
0,c# floating-point type-conversion double ...
1,html css css3 internet-explorer-7
2,c# .net datetime
3,c# datetime time datediff relative-time-s...
4,javascript html browser timezone timezone...


In [22]:
data_text_OK_tags_OK = pd.concat([unik['Text_OK'], Tags_tmp["Tags_OK"]], axis = 1)
data_text_OK_tags_OK.head(10)

Unnamed: 0,Text_OK,Tags_OK
0,want use track bar change form opacity code de...,c# floating-point type-conversion double ...
1,absolutely positioned div containing several c...,html css css3 internet-explorer-7
2,given datetime representing person birthday ca...,c# .net datetime
3,given specific datetime value display relative...,c# datetime time datediff relative-time-s...
4,standard way web server able determine user ti...,javascript html browser timezone timezone...
5,difference math floor truncate net,.net math
6,expose linq query asmx web service usually bus...,c# linq web-services .net-3.5
7,store binary data mysql,mysql database binary-data data-storage
8,looking fastest way obtain value personal chal...,performance algorithm language-agnostic un...
9,trigger update table throw error prevents mysql,mysql database triggers


In [23]:
data_text_OK_tags_OK.shape

(10000, 2)

# Database saving

<strong> split for final test

In [24]:
limite = (data_text_OK_tags_OK.shape[0])*80/100
limite = int(limite)
limite

8000

In [25]:
data_text_FULL = data_text_OK_tags_OK[:limite]
data_text_FULL.shape

(8000, 2)

In [26]:
data_text_FULL_test = data_text_OK_tags_OK[limite:]
data_text_FULL_test.shape

(2000, 2)

In [27]:
data_text_FULL.to_csv(path_or_buf='DATA_clean/20180924_data_Text.csv', 
                        index=True)

In [28]:
data_text_FULL_test.to_csv(path_or_buf='DATA_clean/20180924_data_Text_test.csv', 
                        index=True)