# Running a Random Forest on A Larger Dataset

In [1]:
from SEData.data import *
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from time import sleep
from random import randint

In [2]:
questions = pd.read_csv('most_recent_by_category.csv', squeeze = True, index_col=0)

In [3]:
n = 5000 # Size of data set
l = round(n/ len(questions)) #Will create l links for each category. Also ensures balanced labels.

In [4]:
data_links = back_generate_links(questions['url'], l)


In [5]:
data = pd.DataFrame([get_text(link, pause = True) for link in data_links])


In [6]:
data.dropna(axis = 0, how = 'any', inplace = True) #Drop NAs in place.
data.columns = ['labels','text']

In [7]:
train_data, test_data = train_test_split(data, test_size = .25) #stratify = data['labels'])

In [8]:
train_corpus = train_data['text']
train_labels = train_data['labels']
test_corpus = test_data['text']
test_labels = test_data['labels']

In [15]:
tfidf = TfidfVectorizer(stop_words = 'english', max_features = 2000, ngram_range=(1,3)) #Limit number of features at 500 words with highest Tfidf score
train_matrix = tfidf.fit_transform(train_corpus)

In [16]:
test_matrix = tfidf.transform(test_corpus)


## Train a Decision Tree Model
### X_train : train_matrix
### y_train : train_labels
### X_test : test_matrix
### y_test : test_labels

In [20]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)
forest.fit(train_matrix, train_labels)

pred_labels = forest.predict(test_matrix)

In [21]:
print(accuracy_score(test_labels, pred_labels))

0.734950584007


In [19]:
pd.DataFrame({"50 Test Data Labels":test_labels[:50], "50 Predicted Data Labels":pred_labels[:50]})

Unnamed: 0,50 Predicted Data Labels,50 Test Data Labels
2893,music,music
4942,writers,writers
769,codegolf,codegolf
3918,softwareengineering,softwareengineering
2547,stats,mathematica
1039,diy,diy
1268,chemistry,ell
2183,japanese,japanese
2953,outdoors,outdoors
3487,health,russian


In [31]:
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(test_matrix.toarray()).flatten()[::-1]

n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['born' 'ag' 'reasons' 'space' 'long' 'dwarf' 'example' 'broad' 'humans'
 'fear' 'planet' 'rotation' 'slow' 'office' 'red' 'various' 'night' 'hold'
 'future' 'family' 'important' 'reference' 'considered' 'ones' 'public'
 'class' 'large' 'home' 'person' 'day' 'similar' 'place' 'years' 'looking'
 'set' 'people' 'think' 'make' 'like' 'friends' 'friendly number' 'чем'
 'fruit' 'ft' 'fully' 'function' 'friendly' 'fx' 'gain' 'game' 'games'
 'gap' 'garbage' 'functions' 'free' 'friend' 'footage' 'fly' 'flying'
 'focus' 'folder' 'follow' 'followed' 'following' 'following sentence'
 'follows' 'font' 'food' 'force' 'frequency' 'forcing' 'form' 'format'
 'forms' 'formula' 'forward' 'frame' 'gau' 'freedom' 'freely' 'french'
 'gas' 'getting' 'gave' 'grid' 'group' 'groups' 'guess' 'guide'
 'guidelines' 'guidelines usage' 'guidelines usage registered' 'guitar'
 'guy' 'guys' 'hair' 'hairsp' 'half' 'hand' 'handle']


In [30]:
questions.head()

Unnamed: 0_level_0,id,url
category,Unnamed: 1_level_1,Unnamed: 2_level_1
academia,96491,https://academia.stackexchange.com/questions/9...
askubuntu,959520,https://askubuntu.com/questions/959520/how-can...
aviation,44071,https://aviation.stackexchange.com/questions/4...
blender,90787,https://blender.stackexchange.com/questions/90...
boardgames,38511,https://boardgames.stackexchange.com/questions...


In [35]:
from tpot import TPOTClassifier

In [None]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=1, verbosity=2, max_time_mins=5)
pipeline_optimizer.fit(train_corpus, train_labels)
print(pipeline_optimizer.score(test_corpus, test_labels))