# Web Scraping, Storing, Transforming, and Modelling Stack Exchange Questions

In [1]:
from SEData.data import *
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from time import sleep
from random import randint
import os
from datetime import date
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

In [44]:
questions = pd.read_csv('most_recent_by_category.csv', squeeze = True, index_col=0)
links_already_scraped = pd.read_csv('links_scraped.csv', squeeze= True, index_col=0)

In [64]:
n = 1000 # Size of data set
l = round(n/ len(questions)) #Will create l links for each category. Also ensures balanced labels.
print("Data set of " + str(n) + ' observations, ' + str(l) + ' for each of ' + str(len(questions)) + ' categories.')
data_links = back_generate_links(questions['url'], l)

Data set of 1000 observations, 15 for each of 65 categories.


In [65]:
data_links = [i for i in data_links if i not in list(links_already_scraped)]
print(str(len(data_links)) + " new links not yet added to dataset")

210 new links not yet added to dataset


# Web Scraping (Takes a while)

In [66]:
if len(data_links) > 0:
    data = pd.DataFrame([get_text(link, pause = True) for link in data_links])


    data.dropna(axis = 0, how = 'any', inplace = True) #Drop NAs in place.
    data.columns = ['labels','text']

    data.to_csv('Full_Data_Set.csv', mode = 'a', header=True, encoding= 'utf8', columns=['labels','text'])

    links_df = pd.DataFrame({'url':data_links})
    links_df.to_csv('links_scraped.csv', mode = 'a', header=True, encoding= 'utf8')
else:
    print("No new links to process")
    


In [67]:
# Credit to: https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

file_size('Full_Data_Set.csv')


'1.8 MB'

# Split training and test data

In [68]:
# data = pd.read_csv('5000Questions.csv', engine='python')[['labels', 'text']]
data = pd.read_csv('Full_Data_Set.csv', engine='python')[['labels', 'text']]
data = data[data.isin(questions.index)['labels']] # Filter out messy data, including 'None' values

In [69]:
data['labels'].unique()

array(['academia', 'astronomy', 'aviation', 'blender', 'boardgames',
       'chemistry', 'chess', 'chinese', 'codegolf', 'codereview',
       'cooking', 'crypto', 'dba', 'diy', 'dsp', 'electronics', 'ell',
       'engineering', 'english', 'ethereum', 'gamedev', 'gaming', 'german',
       'graphicdesign', 'health', 'hinduism', 'history', 'interpersonal',
       'japanese', 'judaism', 'law', 'lifehacks', 'math', 'mathematica',
       'mathoverflow', 'money', 'movies', 'music', 'outdoors', 'parenting',
       'photo', 'physics', 'politics', 'puzzling', 'rpg', 'russian',
       'salesforce', 'scifi', 'security', 'skeptics',
       'softwareengineering', 'stats', 'tex', 'travel', 'unix', 'ux', 'vi',
       'workplace', 'worldbuilding', 'writers', 'cs', 'space'], dtype=object)

Train test split:

In [21]:
train_data, test_data = train_test_split(data, test_size = .20) #stratify = data['labels'])

In [22]:
train_corpus = train_data['text']
train_labels = train_data['labels']
test_corpus = test_data['text']
test_labels = test_data['labels']

In [23]:
data['text'][390]

' I am creating a VR game Currently Im testing it on an Android device Now I when I run the application it runs smoothly and without any issue However when I add the following model  And attempt to look at it the game crashes  The model you see doesnt have any scripts on him only a transform component and an animator Has anyone tried something similar or have an idea what the issue might be Update i have also tried this following scene So i tried to isolate the character completely from my original scene So i made this   Again if i have the character in the scene it crashes if i remove the character it runs perfectly fine  Another update If i remove the animator component from the model it works fine '

# Transform data

In [24]:
tfidf = TfidfVectorizer(stop_words = 'english',
                        max_features = 2000,
                        ngram_range=(1,3))
train_matrix = tfidf.fit_transform(train_corpus)

In [25]:
test_matrix = tfidf.transform(test_corpus)


# Begin modelling

##### X_train : train_matrix
##### y_train : train_labels
##### X_test : test_matrix
##### y_test : test_labels

In [26]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_features = 'auto', n_estimators=500)
forest.fit(train_matrix, train_labels)

pred_labels = forest.predict(test_matrix)

In [27]:
print(accuracy_score(test_labels, pred_labels))

0.63133640553


## Grid Search Cross Validation to Tune Hyper-parameters of Random Forest

In [28]:
from sklearn.model_selection  import GridSearchCV

In [30]:
param_grid = {'n_estimators' : [10, 100, 500], 'max_features' :  ['auto', 'sqrt', 'log2']}

CV_forest = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 5)

CV_forest.fit(train_matrix, train_labels)

print(CV_forest.best_params_)
print(CV_forest.best_score_)

In [70]:
forest = CV_forest.best_estimator_

forest.fit(train_matrix, train_labels)

pred_labels = forest.predict(test_matrix)

print(accuracy_score(test_labels, pred_labels))

0.645161290323


# Visualizing Model Results

In [42]:
pd.DataFrame({"50 Test Data Labels":test_labels[:50], "50 Predicted Data Labels":pred_labels[:50]})

Unnamed: 0,50 Predicted Data Labels,50 Test Data Labels
806,skeptics,skeptics
470,law,history
1925,dsp,dsp
579,math,mathoverflow
1352,tex,tex
2017,math,gaming
975,mathematica,mathematica
2605,workplace,workplace
309,dsp,dsp
1854,crypto,crypto


In [43]:
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(test_matrix.toarray()).flatten()[::-1]

n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['clean' 'easily' 'wear' 'alternative' 'want' 'exists' 'looks like'
 'dont want' 'looks' 'looking' 'used' 'make' 'dont' 'know' 'like' 'finding'
 'finished' 'fine' 'forces' 'font' 'fit' 'finally' 'form' 'formal' 'final'
 'filter' 'finite' 'fixed' 'fix' 'follows' 'force' 'flight' 'flights'
 'files' 'flow' 'foot' 'folder' 'food' 'folders' 'follow' 'following'
 'floor' 'ð¾ñ' 'file' 'figure' 'factors' 'fail' 'failed' 'fairly' 'fake'
 'fall' 'false' 'family' 'far' 'fast' 'faster' 'fat' 'father' 'feature'
 'features' 'feed' 'feel' 'feel like' 'feeling' 'feet' 'felt' 'female'
 'fiancã' 'field' 'fields' 'fight' 'formation' 'format' 'freedom' 'formula'
 'generate' 'german' 'gets' 'getting' 'girl' 'given' 'gives' 'giving'
 'goal' 'goes' 'going' 'good' 'google' 'got' 'government' 'gpgagent'
 'gpgagent dbg' 'gpgagent dbg chan' 'granted' 'graph' 'great' 'greatly'
 'green' 'ground' 'group']


## Plotting a confusion Matrix:

In [54]:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [60]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(test_labels, pred_labels)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(24,24))
plot_confusion_matrix(cnf_matrix, classes=questions.index,
                      title='Confusion matrix, without normalization')

# # Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
#                       title='Normalized confusion matrix')

# plt.show()
plt.savefig('ConfusionMatrix.jpg')

Confusion matrix, without normalization
[[6 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 7 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 3 0 0]
 [0 0 0 ..., 0 6 0]
 [0 0 0 ..., 0 0 3]]
