In [48]:
import re
import pandas as pd
import numpy as np

In [9]:
# One-off step, kept for reference: persist the scraped descriptions dict.
# np.save('my_file.npy', all_desc)

# Load the cached dictionary so we don't have to web-scrape on every run.
# np.save stores a dict as a pickled 0-d object array, and NumPy >= 1.16.3
# refuses to load pickled data unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
read_dictionary = np.load('my_file.npy', allow_pickle=True).item()

In [10]:
# Alias used by the rest of the notebook. Despite the name, the dictionary
# holds every scraped category (see the keys listed below), not only EDUCATION.
edu  = read_dictionary

In [75]:
# Show the category names stored in the dictionary.
edu.keys()

dict_keys(['EDUCATION', 'BUSINESS', 'DATING', 'SPORTS', 'WEATHER', 'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'BEAUTY', 'MUSIC_AND_AUDIO', 'NEWS_AND_MAGAZINES', 'SOCIAL', 'SHOPPING', 'PRODUCTIVITY', 'PHOTOGRAPHY', 'MEDICAL', 'PARENTING', 'COMMUNICATION', 'TOOLS'])

# TF-IDF

In [12]:
import nltk
import sklearn

from nltk.collocations import *
from nltk import FreqDist, word_tokenize
import string, re
from nltk.stem.snowball import SnowballStemmer

In [76]:
# Regex pattern used for tokenizing: a run of ASCII letters, optionally
# followed by a straight apostrophe and lowercase letters (e.g. "don't").
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

In [77]:
# stop words
from nltk.corpus import stopwords

# Build the set once so token filtering can use fast membership tests.
# (The bare `stopwords.words("english")` expression that used to sit here
# produced no output and had no effect, so it was removed.)
stop_words = set(stopwords.words('english'))

In [78]:
# English Snowball stemmer; text_cleaner() uses it to reduce each token to its stem.
stemmer = SnowballStemmer("english")

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings. It is fitted once on the cleaned corpus
# and the same instance is reused later to transform new text.
tfidf = TfidfVectorizer()

In [79]:
def text_cleaner(description):
    '''Normalize one raw description into a space-separated string of stems.

    Steps: tokenize with the notebook-level regex ``pattern``, lower-case
    each token, drop tokens present in ``stop_words``, stem the remainder
    with ``stemmer`` and join the stems with single spaces.

    Parameters
    ----------
    description : str
        Raw text of a single description.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives filtering.
    '''
    # Lower-casing happens after tokenizing, so the regex sees the original case.
    lowered = (raw.lower() for raw in nltk.regexp_tokenize(description, pattern))
    stems = [stemmer.stem(token) for token in lowered if token not in stop_words]
    return ' '.join(stems)

In [80]:
def dict_cleaner(dictionary):
    '''Clean every description held in a ``{category: descriptions}`` mapping.

    Categories are visited in the dictionary's iteration order and the
    descriptions of each category in their stored order, so the result lines
    up with any label list built by walking the same dictionary the same way.

    Parameters
    ----------
    dictionary : dict
        Maps a category name to an iterable of raw description strings.

    Returns
    -------
    list of str
        One cleaned string (see ``text_cleaner``) per description, flattened
        across all categories.
    '''
    # Only the values are needed; the category key is not used here.
    return [text_cleaner(description)
            for descriptions in dictionary.values()
            for description in descriptions]

In [81]:
# Flat list of cleaned descriptions across all categories (the TF-IDF corpus).
description_list = dict_cleaner(edu)

In [82]:
# Learn the vocabulary / IDF weights and vectorize the corpus (sparse matrix).
response = tfidf.fit_transform(description_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# Dense frame: one row per description, one column per vocabulary term.
df = pd.DataFrame(response.toarray(), columns=feature_names)


In [83]:
# Mean number of stored (non-zero) entries per row of the TF-IDF matrix.
non_zero_cols = response.nnz / response.shape[0]
print("Average Number of Non-Zero Elements in Vectorized Reviews: {}".format(non_zero_cols))

# Share of each row that is zero on average; note this is a fraction in
# [0, 1], even though the printed message calls it a percentage.
percent_sparse = 1 - non_zero_cols / response.shape[1]
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Reviews: 112.69537037037037
Percentage of columns containing 0: 0.9914657046292791


In [84]:
# Builds the label column: every key is repeated once per item stored under it
# (60 per category in our data), in the dictionary's iteration order.
def create_labels(dictionary):
    """Return a flat list with each key repeated ``len(dictionary[key])`` times.

    Keys appear in the dictionary's iteration order, so the result has one
    label per stored item, grouped by key.
    """
    return [category
            for category, items in dictionary.items()
            for _ in range(len(items))]

In [85]:
# One category label per description, in the same order dict_cleaner() walks the dictionary.
labels = create_labels(edu)

In [86]:
# `new_list` exists only inside create_labels(), so referencing it here raises
# NameError on a fresh kernel; check the returned list instead.
len(labels)

1080

In [87]:
# (number of descriptions, vocabulary size); the row count should equal len(labels).
df.shape

(1080, 13205)

In [88]:
# Attach each row's category. Rows and labels line up only because
# dict_cleaner() and create_labels() iterate the dictionary in the same order.
df['labels'] = labels

In [89]:
# Peek at the last rows to confirm the labels column was added.
df.tail()

Unnamed: 0,aa,aac,aaptiv,aarp,aask,ab,abajo,abandon,abc,abcmous,...,zoomterrain,zoomwithushav,zoosk,zte,zulili,zulu,zulufor,zulupermiss,zumba,labels
1075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS


In [90]:
# Summary statistics of the numeric TF-IDF columns.
df.describe()

Unnamed: 0,aa,aac,aaptiv,aarp,aask,ab,abajo,abandon,abc,abcmous,...,zoompres,zoomterrain,zoomwithushav,zoosk,zte,zulili,zulu,zulufor,zulupermiss,zumba
count,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,...,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0
mean,3.3e-05,0.000234,0.000172,6.8e-05,6.2e-05,0.001763,1.7e-05,0.000144,0.001015,0.000834,...,0.000108,5.6e-05,0.000108,0.000713,3.7e-05,0.000225,0.000169,3.8e-05,6.4e-05,7.7e-05
std,0.001082,0.003569,0.00564,0.002229,0.002038,0.018597,0.00057,0.004748,0.013862,0.022827,...,0.00354,0.001843,0.00354,0.023415,0.001212,0.007396,0.003225,0.001261,0.002107,0.002537
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.035569,0.06642,0.185352,0.073265,0.066988,0.349607,0.018718,0.156044,0.387101,0.730989,...,0.116346,0.06058,0.116346,0.769504,0.039817,0.243071,0.071782,0.04143,0.069259,0.08337


In [91]:
# Sample description that is not taken from the loaded dictionary in this
# notebook; it is cleaned, vectorized and classified in the cells that follow.
new_desc = """Find your next commercial real estate deal on the go! Leverage the power of LoopNet anytime, anywhere to find your perfect investment property or lease space right from your Android device. Whether in the office or on the road, the LoopNet app keeps you productive, informed, and ahead of the competition with these features:

• Advanced Search Filters: Find the properties that meet your exact needs using specific search criteria and filtering options.
• Comprehensive Property Details: View listing details including property data, photos, videos, financials, map location, satellite images, and street view. See something you like? Contact the broker right from your phone!
• Map or List Based Search: Customize your search experience. Display and adjust your search results on a map, a list, or both.
• Saved Searches & Notifications: Save searches that matter so you can access them quickly and monitor your market day-to-day. We’ll also keep you notified about new listings that match your saved search criteria.
• Watch List & Notifications: Monitor listings of interest by adding them to your Watch List. We’ll notify you as soon as there are any changes or updates to the listings.
• Multiple Device Capability: Access your saved searches and watched listings from any device or on the web through your LoopNet account.

LoopNet is the industry leader with more traffic, listings, and geographic coverage than any other commercial real estate marketplace. Use the LoopNet app to find your next investment property or space to rent, whether office space, multifamily apartment buildings, retail, restaurant, medical, industrial, or land in all major markets in the United States and Canada including Los Angeles, New York, Chicago, San Francisco, Dallas, Houston, and Miami. """

In [92]:
# Apply the same cleaning used for the training corpus to the sample description.
x_trial = text_cleaner(new_desc)

In [93]:
# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the corpus. The input must be an iterable of documents, hence the list.
x_vect = tfidf.transform([x_trial])

In [94]:
# Preview the first 100 characters of the cleaned sample text.
x_trial[:100]

'find next commerci real estat deal go leverag power loopnet anytim anywher find perfect invest prope'

In [95]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# Single-row frame for the sample description, with one column per vocabulary term.
dffff = pd.DataFrame(x_vect.toarray(), columns=feature_names)

In [32]:
# ROC CURVE FOR MODELS AND OTHERS SHOWING PERFORMANCE, <- CAN'T DO... 
# TEXT RELATED VISUALISATIONS LIKE WORD CLOUD, LATENT SEMANTIC ANALYSIS
# FEATURE IMPORTANCE ON RF, NB
# CREATE VIZ OF ENSEMBLE MODELS
# TYING SPECIFIC 

# Test-Train Split

In [33]:
# Target vector: the category label of each description.
y = df.labels

In [34]:
# Select the feature columns by name rather than by position, so this stays
# correct even if 'labels' is not the last column of the frame.
X = df.drop(columns=['labels'])

In [35]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# the same on every run.
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

# TPOT for automated model selection

In [50]:
from tpot import TPOTClassifier

In [51]:
# Candidate TPOT search space: maps an estimator's import path to the
# hyper-parameter values to try (an empty dict lists no values to vary).
# NOTE(review): nothing in the visible cells passes this dict to
# TPOTClassifier (no config_dict= argument), so it currently has no effect.
config = {
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.svm.LinearSVC' : {
        'C' : [1, 5, 10, 15, 20, 25],
        'dual' : [True, False],
        'loss' : ["squared_hinge", "hinge" ],
        'penalty' : ['l1', 'l2']
    },
    'sklearn.neighbors.KNeighborsClassifier' : {
    },
    'sklearn.ensemble.RandomForestClassifier' : {
        
    }
}

In [52]:
# Generation and population sizes were tuned by hand to get better results.
# Each candidate pipeline is scored with 3-fold CV and capped at 10 minutes.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=3,
    max_eval_time_mins=10,
    verbosity=3,
)

In [None]:
# Run the TPOT pipeline search on the full TF-IDF training features.
tpot.fit(X_train, y_train)

In [None]:
# Write the best pipeline found by the search to a Python file.
tpot.export('class4-pipeline.py')

In [None]:
# 0.8009259259259259 First pipeline trial
# 0.8055555555555556 Second pipeline trial
# 0.7685185185185185 Third Pipeline trial

In [41]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier


In [42]:
# Stand-alone linear SVM: primal problem (dual=False), squared hinge loss, L2 penalty.
# NOTE(review): presumably taken from a TPOT-exported pipeline -- confirm.
last = LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)

In [None]:
# Train on the full TF-IDF feature set.
last.fit(X_train, y_train)

In [None]:
# Predict the category of the sample description vectorized into `dffff`.
last.predict(dffff)

In [None]:
# Ensemble member: like `last`, but with a larger C (25) and a tighter tolerance.
svc1 = LinearSVC(C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.001)

In [None]:
# Ensemble member: plain hinge loss solved in the dual, with C=1.
svc2 = LinearSVC(C=1.0, dual=True, loss="hinge", penalty="l2", tol=1e-05)

In [None]:
# Ensemble member: gradient boosting with a small learning rate; each tree is
# fitted on a 25% row subsample (subsample=0.25) and 20% of the features per split.
gb_clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=6, max_features=0.2, min_samples_leaf=3, min_samples_split=15, n_estimators=100, subsample=0.25)

# Voting Classifier

In [46]:
from sklearn.ensemble import VotingClassifier

In [47]:
# The Naive Bayes, k-NN and Random Forest instances (nb_classifier, knn,
# rf_classifier) are only created further down the notebook, so referencing
# them here fails on a top-to-bottom run. VotingClassifier fits clones of the
# estimators it is given, so fresh unfitted instances with the same settings
# are equivalent.
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Hard voting: each estimator casts one vote and the majority class wins.
model = VotingClassifier(estimators=[('svc1', svc1),
                                     ('svc2', svc2),
                                     ('gb_clf', gb_clf),
                                     ('nb', MultinomialNB()),
                                     ('knn', KNeighborsClassifier()),
                                     ('rf', RandomForestClassifier(n_estimators=50))],
                         voting='hard')

NameError: name 'svc1' is not defined

In [None]:
import joblib

In [None]:
# Ensemble variants that were tried:
# model1 with: svc1, svc2, last, gb_clf, nb, knn, rf
# model2 with: svc1, svc2, gb_clf, nb, knn, rf
# last -> SVC model 
#         LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)

# Fit the voting ensemble on the training split.
model.fit(X_train, y_train)

# Recorded scores (presumably test accuracy; not computed in this cell -- confirm):
# model1 => 0.8101851851851852 <- voting
# model2 => 0.8148148148148148 <-voting
# last => 0.8194444444444444 <-svc

In [None]:
# To Store Models:
# joblib.dump(model, 'model2')


# To load model
# model1 = joblib.load('model1')

# Random Forest and others

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score

In [98]:
# k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()

# Naive-Bayes Multinomial Classifier

In [99]:
# Multinomial Naive Bayes with default settings.
nb_classifier = MultinomialNB()

In [100]:
# Fit on the training split, then predict both splits so train and test
# accuracy can be compared side by side.
nb_classifier.fit(X_train, y_train)
nb_train_preds, nb_test_preds = (
    nb_classifier.predict(features) for features in (X_train, X_test)
)

nb_train_score, nb_test_score = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, nb_train_preds), (y_test, nb_test_preds))
)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.9225 		 Testing Accuracy: 0.7454


# Random Forest Classifier

In [102]:
# Random forests are stochastic; a fixed seed keeps the reported accuracies
# reproducible from run to run (the same seed value as the train/test split).
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=0)

# Fit on the training split and predict both splits to compare accuracy.
rf_classifier.fit(X_train, y_train)
rf_train_preds = rf_classifier.predict(X_train)
rf_test_preds = rf_classifier.predict(X_test)

rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.7454


# PCA experiment

In [53]:
from sklearn.decomposition import PCA

In [54]:
# A float between 0 and 1 asks PCA to keep as many components as are needed
# to explain that share of the variance (here 90%).
pca = PCA(.90)

In [57]:
# Number of components PCA kept to reach the variance target.
# NOTE(review): n_components_ is only set by fitting, and in notebook order the
# fit happens in the next cell -- run that cell before this one.
pca.n_components_

565

In [55]:
# Fit the projection on the training split only, then apply that same
# projection to the test split.
new_train = pca.fit_transform(X_train)
new_test = pca.transform(X_test)

In [58]:
# Repeat the TPOT search on the PCA-reduced training features.
# NOTE(review): the recorded output shows this run was closed prematurely.
tpot.fit(new_train, y_train)

30 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…



TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [None]:
# Write the best pipeline from the PCA-feature search to a Python file;
# this needs a completed tpot.fit() run.
tpot.export('pca2.py')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extra-trees classifier that is fitted on the PCA-reduced features below.
# NOTE(review): the name suggests these settings came from a TPOT export -- confirm.
exported_pipeline = ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.5, min_samples_leaf=3, min_samples_split=7, n_estimators=100)

In [None]:
# Train on the PCA-reduced training features.
exported_pipeline.fit(new_train, y_train)

In [None]:
# Mean accuracy on the PCA-reduced hold-out set.
exported_pipeline.score(new_test, y_test)

In [43]:
# Refit the same `last` estimator on the PCA-reduced features. From here on
# it expects PCA-transformed input, not the raw TF-IDF columns.
last.fit(new_train, y_train)

LinearSVC(C=10.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.1,
     verbose=0)

In [44]:
# Mean accuracy of the refitted LinearSVC on the PCA-reduced hold-out set.
last.score(new_test, y_test)

0.8194444444444444

# Pretty confusion matrix

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    '''Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; the caller creates
    the figure beforehand and shows or saves it afterwards.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    classes : iterable of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True, each row is divided by its total so cells show proportions.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the heat map.
    '''
    # Materialize once so dict views and generators work with len() and as tick labels.
    classes = list(classes)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row total; leave that row at
        # 0 instead of producing NaN from a 0/0 division.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Proportions get two decimals; raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last so the axis labels and rotated ticks are taken into account.
    plt.tight_layout()

In [None]:
# `last` was most recently fitted on the PCA-reduced features (new_train), so
# it must be given the PCA-transformed test set; passing the raw X_test would
# fail with a feature-count mismatch.
preds = last.predict(new_test)

In [None]:
# By default confusion_matrix orders classes by sorted label value, while the
# tick labels for the plot come from edu.keys() (dictionary order). Pin the
# label order explicitly so rows/columns and tick labels line up.
cm = confusion_matrix(y_test, preds, labels=list(edu.keys()))

In [None]:
# Tick labels for the plot, in the dictionary's key order; the confusion
# matrix must use the same class order for the axes to be labelled correctly.
cate = edu.keys()

In [None]:
# Create a large square figure first; plot_confusion_matrix draws on the current figure.
plt.figure(figsize=(10,10))
plot_confusion_matrix(cm, cate)