# Web Scraping, Storing, Transforming, and Modelling Stack Exchange Questions

In [182]:
from SEData.data import *
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from time import sleep
from random import randint
import os
from datetime import date

In [307]:
questions = pd.read_csv('most_recent_by_category.csv', squeeze = True, index_col=0)
links_already_scraped = pd.read_csv('links_scraped.csv', squeeze= True, index_col=0)

In [310]:
n = 100 # Size of data set
l = round(n/ len(questions)) #Will create l links for each category. Also ensures balanced labels.
print("Data set of " + str(n) + ' observations, ' + str(l) + ' for each of ' + str(len(questions)) + ' categories.')
data_links = back_generate_links(questions['url'], l)

Data set of 100 observations, 2 for each of 64 categories.


In [311]:
data_links = [i for i in data_links if i not in list(links_already_scraped)]
print(str(len(data_links)) + " new links not yet added to dataset")

10 new links not yet added to dataset


# Web Scraping (Takes a while)

In [216]:
if len(data_links) > 0:
    data = pd.DataFrame([get_text(link, pause = True) for link in data_links])


    data.dropna(axis = 0, how = 'any', inplace = True) #Drop NAs in place.
    data.columns = ['labels','text']

    data.to_csv('Full_Data_Set.csv', mode = 'a', header=True, encoding= 'utf8', columns=['labels','text'])

    links_df = pd.DataFrame({'url':data_links})
    links_df.to_csv('links_scraped.csv', mode = 'a', header=True, encoding= 'utf8')
else:
    print("No new links to process")
    


In [217]:
# Credit to: https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

file_size('Full_Data_Set.csv')


'719.5 KB'

# Split training and test data

In [266]:
data = pd.read_csv('Full_Data_Set.csv', engine='python')[['labels', 'text']]

In [240]:
train_data, test_data = train_test_split(data, test_size = .20) #stratify = data['labels'])

In [277]:
data  = data.replace(to_replace='None', value=np.nan).dropna()

In [289]:
train_corpus = train_data['text']
train_labels = train_data['labels']
test_corpus = test_data['text']
test_labels = test_data['labels']

In [279]:
data.dropna(how = 'any', inplace = True)

# Transform data

In [290]:
tfidf = TfidfVectorizer(stop_words = 'english',
                        max_features = 2000,
                        ngram_range=(1,3))
train_matrix = tfidf.fit_transform(train_corpus)

AttributeError: 'NoneType' object has no attribute 'lower'

In [16]:
test_matrix = tfidf.transform(test_corpus)


# Begin modelling

##### X_train : train_matrix
##### y_train : train_labels
##### X_test : test_matrix
##### y_test : test_labels

In [20]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)
forest.fit(train_matrix, train_labels)

pred_labels = forest.predict(test_matrix)

In [21]:
print(accuracy_score(test_labels, pred_labels))

0.734950584007


In [19]:
pd.DataFrame({"50 Test Data Labels":test_labels[:50], "50 Predicted Data Labels":pred_labels[:50]})

Unnamed: 0,50 Predicted Data Labels,50 Test Data Labels
2893,music,music
4942,writers,writers
769,codegolf,codegolf
3918,softwareengineering,softwareengineering
2547,stats,mathematica
1039,diy,diy
1268,chemistry,ell
2183,japanese,japanese
2953,outdoors,outdoors
3487,health,russian


In [31]:
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(test_matrix.toarray()).flatten()[::-1]

n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['born' 'ag' 'reasons' 'space' 'long' 'dwarf' 'example' 'broad' 'humans'
 'fear' 'planet' 'rotation' 'slow' 'office' 'red' 'various' 'night' 'hold'
 'future' 'family' 'important' 'reference' 'considered' 'ones' 'public'
 'class' 'large' 'home' 'person' 'day' 'similar' 'place' 'years' 'looking'
 'set' 'people' 'think' 'make' 'like' 'friends' 'friendly number' 'чем'
 'fruit' 'ft' 'fully' 'function' 'friendly' 'fx' 'gain' 'game' 'games'
 'gap' 'garbage' 'functions' 'free' 'friend' 'footage' 'fly' 'flying'
 'focus' 'folder' 'follow' 'followed' 'following' 'following sentence'
 'follows' 'font' 'food' 'force' 'frequency' 'forcing' 'form' 'format'
 'forms' 'formula' 'forward' 'frame' 'gau' 'freedom' 'freely' 'french'
 'gas' 'getting' 'gave' 'grid' 'group' 'groups' 'guess' 'guide'
 'guidelines' 'guidelines usage' 'guidelines usage registered' 'guitar'
 'guy' 'guys' 'hair' 'hairsp' 'half' 'hand' 'handle']
