# Web Scraping and Text Classification of Questions Asked on Stack Exchange

## Import our web scraping functions defined in the SEData package:

In [1]:
from SEData.data import *
import numpy as np
import pandas as pd

## Populate a list of question URLs from Stack Exchange's current "Hot Network Questions"

In [2]:
# Collect the question links used as the training set; later cells pass each one to
# get_text() and hand the whole collection to populate_stepback_links().
question_links = populate_question_links()

## We can use the get_text function to return a tuple of the SE category and question text corresponding to any SE question URL:

In [3]:
# Scrape only the first collected link; the result is shown as the cell output.
first_link = question_links[0]
get_text(first_link)

('academia',
 ' Im writing a mathematical paper In it I use a lemma The lemma is not hard to prove and I have verified it myself The proof is too tedious to include in the paper so I want to just include a citation I found a paper that includes the result However that paper does not actually include a proof I cannot find any other place where this lemma appears I see three options  State the lemma without proof or citation State the lemma without proof but cite the paper that states the lemma without proof or citation Provide a proof of the lemma  Which is most appropriate Option  is easiest but might annoy some readers who dont believe me Option  seems like a cop out Option  is safest but I dont think its necessary as the proof is really just a long and boring calculation ADDED To be clear the lemma is basically an integral The proof consists of splitting up the domain of integration to remove absolute values evaluating each of the parts easy enough for symbolic integration packages l

## We can create a small corpus of 50 questions:

In [4]:
# Scrape every collected link, keeping the results in the same order as question_links.
data = list(map(get_text, question_links))

In [5]:
# Tabulate the scraped results and discard every row that contains a missing value.
data = pd.DataFrame(data).dropna(axis=0, how='any')


#### NAs occur for a given URL if there is no question text (e.g., the question was removed by moderators) or if the page hasn't been created yet (i.e., the ID is higher than the ID of the most recently asked question within a given SE category).

In [6]:
# Column 0 becomes the labels and column 1 the corpus.
labels, corpus = data[0], data[1]

In [7]:
# Preview the first four documents of the corpus.
print(corpus.head(4))

0     Im writing a mathematical paper In it I use a...
1     I am a childless female something who occasio...
2     Background I have been living with my boyfrie...
3     I would like to know about the time format wh...
Name: 1, dtype: object


## We can create a sparse matrix of Term Frequency–Inverse Document Frequency (TF-IDF) scores for each word in the corpus.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
# Cap the vocabulary size. Note that max_features keeps the terms with the highest
# term frequency across the corpus -- not the terms with the highest TF-IDF score.
MAX_FEATURES = 500

tfidf = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)

# Learn the vocabulary and IDF weights from the training corpus and vectorize it in one step.
sparse_matrix = tfidf.fit_transform(corpus)

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer
# get_feature_names_out() when it exists and fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out().tolist()  # ndarray -> plain list, as before
else:
    feature_names = tfidf.get_feature_names()
feature_names

['able',
 'absolute',
 'actually',
 'add',
 'added',
 'address',
 'adult',
 'afraid',
 'age',
 'ago',
 'alloys',
 'answer',
 'answers',
 'approx',
 'argument',
 'array',
 'ascending',
 'ascent',
 'ask',
 'asked',
 'asking',
 'assume',
 'avoid',
 'away',
 'ax',
 'background',
 'based',
 'basevalue',
 'basically',
 'battle',
 'believe',
 'best',
 'better',
 'bismuth',
 'bit',
 'block',
 'body',
 'bounce',
 'brand',
 'buy',
 'car',
 'case',
 'cases',
 'cast',
 'cdot',
 'center',
 'certain',
 'change',
 'characters',
 'child',
 'children',
 'chores',
 'citation',
 'class',
 'classoption',
 'clear',
 'closed',
 'code',
 'coffee',
 'come',
 'comes',
 'comment',
 'common',
 'company',
 'compared',
 'completely',
 'complex',
 'component',
 'components',
 'consequences',
 'constant',
 'contentaddclassoptionno',
 'cooking',
 'countries',
 'course',
 'cut',
 'date',
 'daughter',
 'day',
 'days',
 'deal',
 'decelerate',
 'decided',
 'decimal',
 'delta',
 'depends',
 'descending',
 'details',
 'dia

In [11]:
# Derive the held-out links from the training links.
test_links = populate_stepback_links(question_links)

# Guard against train/test leakage: no training link may also appear among the test links.
# The check previously referenced `sample_question_links`, a name that is not defined in the
# visible notebook; the training links are held in `question_links`.
overlapping_links = [link for link in question_links if link in test_links]
assert len(overlapping_links) == 0, "test links overlap with the training links"

In [12]:
# Scrape the held-out links and drop every row with a missing value,
# mirroring the preparation of the training data.
scraped_test_rows = [get_text(link) for link in test_links]
test_data = pd.DataFrame(scraped_test_rows).dropna(axis=0, how='any')
test_labels, test_corpus = test_data[0], test_data[1]

# transform (not fit_transform): reuse the vocabulary and IDF weights fitted on the training corpus.
test_matrix = tfidf.transform(test_corpus)


## Train a Decision Tree Model
### X_train : sparse_matrix
### y_train : labels
### X_test : test_matrix
### y_test : test_labels

## We don't expect this model to perform very well, given the small sample size of the training data, but this is for illustrative purposes.

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [14]:
# Fix random_state so a fresh "Restart & Run All" rebuilds the same tree: scikit-learn
# randomly permutes the candidate features at each split, so an unseeded tree (and the
# accuracy reported below) can differ from run to run.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(sparse_matrix, labels)

# Predict a label for every row of the test matrix.
pred_labels = tree.predict(test_matrix)

In [15]:
# Fraction of test questions whose predicted label matches the true label.
accuracy = accuracy_score(test_labels, pred_labels)
print(accuracy)

0.0232558139535


In [16]:
# Side-by-side view of the true and the predicted label of each test question.
label_comparison = {"Test Data Labels": test_labels, "Predicted Data Labels": pred_labels}
pd.DataFrame(label_comparison)

Unnamed: 0,Predicted Data Labels,Test Data Labels
0,travel,academia
2,stackoverflow,interpersonal
4,japanese,mathematica
5,travel,japanese
6,travel,unix
7,ux,electronics
8,travel,chemistry
9,travel,money
10,english,law
11,travel,puzzling
