# Web Scraping and Text Classification of Questions Asked on Stack Exchange

## Import our web scraping functions defined in the SEData package:

In [1]:
from SEData.data import *
import numpy as np
import pandas as pd

## Populate a list of question URLs from Stack Exchange's current "Hot Network Questions"
## Save list of categories from those links, and append to existing .csv file of SE categories

In [2]:
# Scrape the current "Hot Network Questions" links and derive a category for each.
question_links = populate_question_links()
categories = [find_cat(link) for link in question_links]

# Merge with the categories saved by earlier runs. The file is written below with
# pandas' default integer index plus a single data column, so the index is read
# back with index_col=0 and the one remaining column is selected positionally.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# `.iloc[:, 0]` yields the same Series on old and new pandas alike.
try:
    former_categories = pd.read_csv('categories.csv', index_col=0).iloc[:, 0].tolist()
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from an empty history.
    former_categories = []

categories = set(categories + former_categories)
pd.DataFrame(list(categories)).to_csv('categories.csv')
del former_categories

## We can use the get_text function to return a tuple of the SE category and question text corresponding to any SE question URL:

In [3]:
# Demonstrate get_text on the first scraped link. As the cell's last expression,
# its return value is displayed (a (category, question text) tuple in the
# recorded output).
get_text(question_links[0])

('english',
 ' Weve already chosen the domain name for a new info blog website called InfoToss In US English it doesnt have any crazy slang meaning and we didnt think to check for other regional meanings because we didnt think we would expand so we went ahead and designed the site and logo Things have changed and now wed like to expand and target a UK audience as well but after doing more research weve learned that the meaning of the word toss in British slang can mean something other than just throwing something see Urban Dictionary if you dont know  So my question is for those British English speakers  is it overly innappropriate for the domain to be InfoToss Or would it be just a little edgey ')

## We can create a small corpus of 50 questions:

In [4]:
# Call get_text once per URL in the static sample list, keeping results in order.
data = list(map(get_text, sample_question_links))

#### sample_question_links is just a static list of 50 URLs pointing to the most popular Stack Exchange questions on 9/13/17

In [5]:
# Tabulate the scraped results and discard every row that has a missing value.
data = pd.DataFrame(data).dropna(axis=0, how='any')


#### NAs occur for a given URL if there is no question text (if the question was removed by moderators) or if the page hasn't been created yet (if the ID is higher than the ID of the most recently asked question within a given SE category)

In [6]:
# Column 0 of the frame becomes the labels, column 1 the corpus.
labels, corpus = data[0], data[1]

In [7]:
# Preview the first four documents of the corpus.
print(corpus.head(4))

0     I have a Raspberry Pi  and I would like to us...
1      This question already has an answer here   R...
2     We are currently building a networking course...
3     The user starts an action and afterwards the ...
Name: 1, dtype: object


## We can create a sparse matrix of Term Frequency–Inverse Document Frequency (TF-IDF) scores for the words in the corpus.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
# Drop English stop words and cap the vocabulary at 500 terms. Note that
# max_features keeps the terms with the highest term frequency across the
# corpus; the ranking is not based on TF-IDF score.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=500,
)
# Learn the vocabulary and IDF weights from the corpus and vectorize it in one pass.
sparse_matrix = tfidf.fit_transform(corpus)

In [10]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version provides it, and
# convert its ndarray to a plain list so the displayed output keeps the same
# list-of-strings form; fall back to the old method on older scikit-learn.
(
    tfidf.get_feature_names_out().tolist()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

['able',
 'account',
 'actually',
 'add',
 'advice',
 'america',
 'amp',
 'answer',
 'appears',
 'appreciate',
 'appreciated',
 'appropriate',
 'arent',
 'armis',
 'artifact',
 'ask',
 'asked',
 'asking',
 'aspects',
 'attack',
 'available',
 'avoid',
 'away',
 'bad',
 'behavior',
 'believe',
 'best',
 'billable',
 'bit',
 'blueborne',
 'bluetooth',
 'bodies',
 'book',
 'box',
 'break',
 'building',
 'bus',
 'cable',
 'cancel',
 'car',
 'case',
 'cases',
 'category',
 'causes',
 'celestial',
 'change',
 'circuit',
 'civilisation',
 'class',
 'code',
 'colonies',
 'colonisation',
 'common',
 'communicate',
 'company',
 'complete',
 'condition',
 'conjecture',
 'connect',
 'conservative',
 'considered',
 'const',
 'contradiction',
 'control',
 'country',
 'course',
 'created',
 'credentials',
 'critical',
 'cultist',
 'currently',
 'deal',
 'deep',
 'details',
 'developer',
 'device',
 'devices',
 'did',
 'didnt',
 'different',
 'difficult',
 'directly',
 'documentation',
 'does',
 'does

In [11]:
# Derive the test URLs from the sample URLs.
test_links = populate_stepback_links(sample_question_links)
# Guard against leakage: no sample URL may also appear among the test URLs.
assert not any(link in test_links for link in sample_question_links)

In [14]:
# Scrape every test link, tabulate the results, and drop rows with missing values.
test_data = pd.DataFrame(list(map(get_text, test_links))).dropna(axis=0, how='any')
test_labels, test_corpus = test_data[0], test_data[1]

# Vectorize with the already-fitted tfidf (transform only, no refit) so the test
# matrix shares the feature columns learned from the training corpus.
test_matrix = tfidf.transform(test_corpus)


## Train a Decision Tree Model
### X_train : sparse_matrix
### y_train : labels
### X_test : test_matrix
### y_test : test_labels

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [20]:
# Fix the seed so the notebook is reproducible: DecisionTreeClassifier randomly
# permutes the features at each split (even with splitter="best"), so an
# unseeded tree can differ between runs when several splits are equally good.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(sparse_matrix, labels)

# Predict a label for every row of the test matrix.
pred_labels = tree.predict(test_matrix)

In [22]:
# Print the accuracy score of the predictions against the true test labels.
print(accuracy_score(y_true=test_labels, y_pred=pred_labels))

0.0


In [26]:
# Side-by-side table of the true test labels and the tree's predictions.
pd.DataFrame(
    {
        "Test Data Labels": test_labels,
        "Predicted Data Labels": pred_labels,
    }
)

Unnamed: 0,Predicted Data Labels,Test Data Labels
0,rpg,raspberrypi
1,interpersonal,worldbuilding
2,scifi,networkengineering
3,worldbuilding,ux
4,scifi,cooking
5,worldbuilding,math
6,scifi,gamedev
7,politics,academia
8,worldbuilding,politics
9,stackoverflow,math
