# Web Scraping, Storing, Transforming, and Modelling Stack Exchange Questions

In [1]:
from SEData.data import *
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from time import sleep
from random import randint
import os
from datetime import date

In [53]:
questions = pd.read_csv('most_recent_by_category.csv', squeeze = True, index_col=0)
links_already_scraped = pd.read_csv('links_scraped.csv', squeeze= True, index_col=0)

In [58]:
n = 1000 # Size of data set
l = round(n/ len(questions)) #Will create l links for each category. Also ensures balanced labels.
print("Data set of " + str(n) + ' observations, ' + str(l) + ' for each of ' + str(len(questions)) + ' categories.')
data_links = back_generate_links(questions['url'], l)

Data set of 1000 observations, 15 for each of 65 categories.


In [59]:
data_links = [i for i in data_links if i not in list(links_already_scraped)]
print(str(len(data_links)) + " new links not yet added to dataset")

385 new links not yet added to dataset


# Web Scraping (Takes a while)

In [None]:
%%timeit
if len(data_links) > 0:
    data = pd.DataFrame([get_text(link, pause = True) for link in data_links])


    data.dropna(axis = 0, how = 'any', inplace = True) #Drop NAs in place.
    data.columns = ['labels','text']

    data.to_csv('Full_Data_Set.csv', mode = 'a', header=True, encoding= 'utf8', columns=['labels','text'])

    links_df = pd.DataFrame({'url':data_links})
    links_df.to_csv('links_scraped.csv', mode = 'a', header=True, encoding= 'utf8')
else:
    print("No new links to process")
    


In [57]:
# Credit to: https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

file_size('Full_Data_Set.csv')


'759.7 KB'

# Split training and test data

In [40]:
# data = pd.read_csv('5000Questions.csv', engine='python')[['labels', 'text']]
data = pd.read_csv('Full_Data_Set.csv', engine='python')[['labels', 'text']]
data = data[data.isin(questions.index)['labels']] # Filter out messy data, including 'None' values

In [42]:
data['labels'].unique()

array(['academia', 'astronomy', 'aviation', 'blender', 'boardgames',
       'chemistry', 'chess', 'chinese', 'codegolf', 'codereview',
       'cooking', 'crypto', 'dba', 'diy', 'dsp', 'electronics', 'ell',
       'engineering', 'english', 'ethereum', 'gamedev', 'gaming', 'german',
       'graphicdesign', 'health', 'hinduism', 'history', 'interpersonal',
       'japanese', 'judaism', 'law', 'lifehacks', 'math', 'mathematica',
       'mathoverflow', 'money', 'movies', 'music', 'outdoors', 'parenting',
       'photo', 'physics', 'politics', 'puzzling', 'rpg', 'russian',
       'salesforce', 'scifi', 'security', 'skeptics',
       'softwareengineering', 'stats', 'tex', 'travel', 'unix', 'ux', 'vi',
       'workplace', 'worldbuilding', 'writers'], dtype=object)

Train test split:

In [43]:
train_data, test_data = train_test_split(data, test_size = .20) #stratify = data['labels'])

In [44]:
train_corpus = train_data['text']
train_labels = train_data['labels']
test_corpus = test_data['text']
test_labels = test_data['labels']

In [45]:
data['text'][390]

' I am creating a VR game Currently Im testing it on an Android device Now I when I run the application it runs smoothly and without any issue However when I add the following model  And attempt to look at it the game crashes  The model you see doesnt have any scripts on him only a transform component and an animator Has anyone tried something similar or have an idea what the issue might be Update i have also tried this following scene So i tried to isolate the character completely from my original scene So i made this   Again if i have the character in the scene it crashes if i remove the character it runs perfectly fine  Another update If i remove the animator component from the model it works fine '

# Transform data

In [46]:
tfidf = TfidfVectorizer(stop_words = 'english',
                        max_features = 2000,
                        ngram_range=(1,3))
train_matrix = tfidf.fit_transform(train_corpus)

In [47]:
test_matrix = tfidf.transform(test_corpus)


# Begin modelling

##### X_train : train_matrix
##### y_train : train_labels
##### X_test : test_matrix
##### y_test : test_labels

In [48]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_features = 'sqrt', n_estimators=100)
forest.fit(train_matrix, train_labels)

pred_labels = forest.predict(test_matrix)

In [49]:
print(accuracy_score(test_labels, pred_labels))

0.517766497462


## Grid Search Cross Validation to Tune Hyper-parameters of Random Forest

In [None]:
from sklearn.model_selection  import GridSearchCV

In [None]:
param_grid = {'n_estimators' = [10, 100, 500], 'max_features' =  ['auto', 'sqrt', 'log2']}

In [None]:
CV_forest = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 5)

In [None]:
CV_forest.fit(train_matrix, train_labels)
print(CV_rfc.best_params_)

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)


rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y)
print CV_rfc.best_params_

# Visualizing Model Results

In [50]:
pd.DataFrame({"50 Test Data Labels":test_labels[:50], "50 Predicted Data Labels":pred_labels[:50]})

Unnamed: 0,50 Predicted Data Labels,50 Test Data Labels
334,travel,ell
153,workplace,aviation
936,writers,writers
703,ethereum,puzzling
185,chess,boardgames
592,lifehacks,money
324,dsp,electronics
445,health,health
193,mathematica,chemistry
729,russian,russian


In [51]:
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(test_matrix.toarray()).flatten()[::-1]

n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['double' 'combination' 'listed' 'lists' 'tests' 'paragraph' 'chosen'
 'beginning' 'relevant' 'legal' 'addition' 'equivalent' 'smart'
 'specifically' 'ones' 'normal' 'taken' 'far' 'makes' 'specific' 'little'
 'possible' 'problem' 'new' 'greater' 'graphics' 'graveyard' 'graphic'
 'great' 'government' 'got' 'ƒã' 'greatestprimefactori' 'green' 'ground'
 'group' 'good' 'group group' 'groups' 'grow' 'grow new' 'grow new teeth'
 'grown' 'grown stuff' 'growth' 'guarantee' 'google' 'girl' 'gone' 'going'
 'fx' 'fã¼r' 'gain' 'game' 'games' 'gamma' 'gas' 'gear' 'gear solid'
 'gear solid phantom' 'general' 'generally' 'generaterandomnumbers'
 'generation' 'generic' 'genotypefile' 'germany' 'gets' 'getting' 'guess'
 'given' 'gives' 'goal' 'god' 'goes' 'guard' 'hand' 'gun'
 'ho cprogram files' 'helps' 'heres' 'hes' 'hex' 'high' 'highend'
 'highend car' 'higher' 'history' 'hit' 'ho' 'ho cprogram' 'hold' 'hell'
 'holding' 'hole' 'home' 'homework' 'hope' 'hopefully' 'horrible']
