In [1]:
#!/usr/local/bin/python3

import pandas as pd
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
import string
import re
import nltk.classify.decisiontree
import nltk.classify.naivebayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import BernoulliNB
import random
import nltk
from nltk.corpus import cmudict 
from nltk.tokenize import sent_tokenize, word_tokenize
import time

# Keep only the first 200 records of the training file.
train_data = pd.read_json("/Users/shubhamkahal/Downloads/train.json").head(200)

# Remove the fields we don't need.
train_data = train_data.drop(
    ['created', 'display_address', 'latitude', 'longitude', 'photos', 'street_address'],
    axis=1
)

#test_data = pd.read_json("/Users/shubhamkahal/Downloads/test.json").head(200)

class WordFeatures:
    """Small collection of text helpers used while building features.

    Provides a stop-word set (``_stopwords``), HTML tag stripping, a numeric
    string check and a syllable counter backed by the CMU pronouncing
    dictionary.
    """

    # Compiled once here instead of on every clean_html() call.
    _HTML_TAG = re.compile('<.*?>')
    # Runs of vowels approximate syllables for words missing from the dictionary.
    _VOWEL_GROUP = re.compile('[aeiouy]+')
    # Pronouncing dictionary, loaded on first use and shared by all instances;
    # building it is expensive, so it must not be rebuilt per word.
    _pronunciations = None

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Build the stop-word set.

        ``min_cut`` and ``max_cut`` are accepted for interface compatibility
        but are not used by any method of this class.
        """
        self._stopwords = set(stopwords.words('english') + list(punctuation) + [u"'s", '"'])

    @classmethod
    def _load_pronunciations(cls):
        """Return the CMU pronouncing dictionary, loading it only once."""
        if cls._pronunciations is None:
            cls._pronunciations = cmudict.dict()
        return cls._pronunciations

    def clean_html(self, raw_html):
        """Return ``raw_html`` with every ``<...>`` tag removed."""
        return self._HTML_TAG.sub('', raw_html)

    def is_number(self, s):
        """Return True when ``float(s)`` succeeds, False on ValueError."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def get_number_of_syllables(self, word):
        """Return the syllable count of ``word``.

        Numeric tokens count as one syllable. Words found in the pronouncing
        dictionary use their first listed pronunciation, counting the phonemes
        that end in a stress digit. Words the dictionary does not know (and
        punctuation tokens) fall back to counting vowel groups instead of
        raising ``KeyError``.
        """
        if self.is_number(word):
            return 1

        lowered = word.lower()
        pronunciations = self._load_pronunciations().get(lowered)
        if pronunciations:
            # Vowel phonemes carry a trailing stress digit (e.g. "AH0").
            return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())

        return len(self._VOWEL_GROUP.findall(lowered))
    
# Module-level helper instance used by get_fog_index() and main().
word_features = WordFeatures()

def get_price_bins(price_list, bin_num):
    """Split the prices into ``bin_num`` consecutive, sorted bins.

    Each price is converted with ``int()`` and the values are sorted in
    ascending order. Every price ends up in exactly one bin: when the number
    of prices is not a multiple of ``bin_num`` the leading bins receive one
    extra element each, so no price is dropped. If there are fewer prices
    than bins, the trailing bins are empty lists.

    Returns a list of ``bin_num`` lists of ints.
    """
    prices = sorted(int(price) for price in price_list)
    bin_size, remainder = divmod(len(prices), bin_num)

    bins = []
    start = 0
    for bin_index in range(bin_num):
        # The first `remainder` bins absorb the prices left over by the
        # integer division.
        end = start + bin_size + (1 if bin_index < remainder else 0)
        bins.append(prices[start:end])
        start = end

    return bins

def bin_price(price, price_bins):
    """Return the mean of the bin that ``price`` belongs to.

    ``price_bins`` is a list of sorted lists of prices. A price inside a
    bin's ``[first, last]`` range maps to that bin (the first such bin wins).
    A price that falls in a gap between bins, or outside every bin, maps to
    the closest bin rather than silently yielding ``None``. Empty bins are
    ignored. Returns ``None`` only when there is no non-empty bin at all.
    """
    candidates = [price_bin for price_bin in price_bins if price_bin]
    if not candidates:
        return None

    def distance(price_bin):
        # Zero when the price lies within the bin's range, otherwise the gap
        # to the nearer edge.
        return max(price_bin[0] - price, price - price_bin[-1], 0)

    # min() returns the first minimum, so a price contained in several bins
    # still resolves to the earliest one.
    closest = min(candidates, key=distance)
    return sum(closest) / len(closest)

def get_uppercase_lowercase_ratio(description):
    """Return the share of uppercase and lowercase characters in ``description``.

    Both ratios are relative to the total length of ``description`` (spaces,
    digits and punctuation included in the denominator). Only characters for
    which ``str.islower()`` is true count as lowercase, so digits and
    punctuation are not counted as lowercase letters.

    Returns a dict with the keys ``"uppercase_ratio"`` and
    ``"lowercase_ratio"``; both are 0 for an empty description.
    """
    if len(description) == 0:
        return {"uppercase_ratio" : 0, "lowercase_ratio" : 0}

    upper_count = sum(1 for letter in description if letter.isupper())
    lower_count = sum(1 for letter in description if letter.islower())
    total = len(description)

    return {"uppercase_ratio" : (upper_count / total), "lowercase_ratio" : (lower_count / total)}

def _is_filled(value):
    """Return True when ``value`` holds actual data rather than a blank."""
    if value is None:
        return False
    if isinstance(value, (int, float, complex)):
        # NaN is the only number that is not equal to itself; it marks a
        # missing value, not a filled one.
        return value == value
    try:
        return len(value) > 0
    except TypeError:
        # A non-None value without a length (e.g. a scalar object) is present.
        return True


def get_completeness_score(row, columns):
    """Count how many of ``columns`` are filled in for ``row``.

    ``row`` is indexed as ``row[column]``. Numbers count as filled unless
    they are NaN; sized values (strings, lists, ...) count when non-empty;
    ``None`` counts as missing instead of raising ``TypeError``.

    Returns the number of filled columns as an int.
    """
    return sum(1 for column in columns if _is_filled(row[column]))

def get_fog_index(description):
    """Return the Gunning fog index of ``description``.

    Computed as ``0.4 * (words / sentences + 100 * complex_words / words)``,
    where a complex word has three or more syllables according to
    ``word_features.get_number_of_syllables``. Tokens without any letter or
    digit (pure punctuation) are not counted as words.

    Returns 0 when the description is empty or contains no sentences or
    words, so the divisions below can never hit zero.
    """
    if len(description) == 0:
        return 0

    sentences = sent_tokenize(description)
    words = [
        token for token in word_tokenize(description)
        if any(character.isalnum() for character in token)
    ]
    if not sentences or not words:
        return 0

    complex_word_count = sum(
        1 for word in words
        if word_features.get_number_of_syllables(word) >= 3
    )

    # The complex-word percentage is relative to ALL words, not to the
    # number of simple words.
    return 0.4 * ((len(words) / len(sentences)) + (100 * (complex_word_count / len(words))))

def get_record_list(data, all_strings, price_list, all_trigrams, manager_to_unique_interest_levels, building_to_unique_interest_levels, bathrooms_to_unique_interest_levels):
    """Turn every row of ``data`` into feature dicts and labels.

    For each row one feature mapping is built (bathrooms, bedrooms, binned
    price, manager_id, building_id, completeness_score, text_richness) and
    emitted three times, once per lookup table, in the order manager,
    building, bathrooms. When the lookup holds exactly one interest level for
    the row's key, that level becomes the record's label; otherwise the row's
    own ``interest_level`` is used.

    ``all_strings`` and ``all_trigrams`` are accepted but not read here.

    Returns ``(record_list, all_features, all_labels)`` where ``record_list``
    holds ``(features, label)`` tuples and ``all_labels`` always holds the
    row's own ``interest_level``.

    NOTE(review): the substituted labels come from lookups that main() derives
    from the interest levels themselves -- confirm this is intended, since it
    presumably inflates the accuracy measured on ``record_list``.
    """
    price_bins = get_price_bins(price_list, 5)
    columns = list(data.columns.values)

    # (feature key, lookup table) pairs in the order their records are emitted.
    label_sources = (
        ('manager_id', manager_to_unique_interest_levels),
        ('building_id', building_to_unique_interest_levels),
        ('bathrooms', bathrooms_to_unique_interest_levels),
    )

    record_list = []
    all_features = []
    all_labels = []

    for _, row in data.iterrows():
        description = row['description']
        own_label = row['interest_level']

        features = defaultdict(None)
        features['bathrooms'] = row['bathrooms']
        features['bedrooms'] = row['bedrooms']
        features['price'] = bin_price(row['price'], price_bins)
        features['manager_id'] = row['manager_id']
        features['building_id'] = row['building_id']
        features['completeness_score'] = get_completeness_score(row, columns)
        # Distinct elements of the description relative to its length
        # (distinct characters, assuming the description is a string).
        if len(description) > 0:
            features['text_richness'] = len(set(description)) / len(description)
        else:
            features['text_richness'] = 0

        for feature_key, unique_levels_by_key in label_sources:
            levels = unique_levels_by_key[features[feature_key]]
            label = levels[0] if len(levels) == 1 else own_label

            # Each list gets its own copy so entries never share state.
            record_list.append((features.copy(), label))
            all_features.append(features.copy())
            all_labels.append(own_label)

    return record_list, all_features, all_labels

def main():
    """Build features from ``train_data``, train the classifiers and report accuracy.

    Two NLTK classifiers are trained on the shuffled ``(features, label)``
    records and eight scikit-learn classifiers on the vectorized features;
    in both cases the first third of the data is held out for scoring. Each
    accuracy is printed and written to ``result.txt`` as alternating
    name/accuracy lines (no trailing newline). Elapsed times for feature
    generation and for training/classification are printed as well.

    No separate test file is evaluated; accuracy is measured on the held-out
    slice of the training records only.
    """
    start_time = time.time()

    manager_to_unique_interest_levels = train_data.groupby('manager_id')["interest_level"].unique().to_dict()
    building_to_unique_interest_levels = train_data.groupby('building_id')["interest_level"].unique().to_dict()
    bathrooms_to_unique_interest_levels = train_data.groupby('bathrooms')["interest_level"].unique().to_dict()

    all_strings = []
    price_list = []
    all_trigrams = set()

    for _, row in train_data.iterrows():
        price_list.append(row['price'])
        all_strings.append([
            word
            for word in word_features.clean_html(row["description"].lower()).split(' ')
            if word not in word_features._stopwords
        ])

    record_list, all_features, y = get_record_list(train_data, all_strings, price_list, all_trigrams, manager_to_unique_interest_levels, building_to_unique_interest_levels, bathrooms_to_unique_interest_levels)

    # NLTK classifiers: shuffled records, first third held out.
    random.shuffle(record_list)
    train_set = record_list[(len(record_list)//3):]
    test_set = record_list[0:(len(record_list)//3)]

    # scikit-learn classifiers: vectorized features in original row order,
    # first third held out.
    vec = DictVectorizer()
    X = vec.fit_transform(all_features).toarray()

    X_train = X[(len(X)//3):]
    X_test = X[0:(len(X)//3)]

    y_train = y[(len(y)//3):]
    y_test = y[0:(len(y)//3)]

    print("--- generating record_list: %s seconds ---" % (time.time() - start_time))

    # (label written to the file and printed, NLTK classifier class)
    nltk_trainers = (
        ("Decision Tree", nltk.DecisionTreeClassifier),
        ("Naive Bayes", nltk.NaiveBayesClassifier),
    )

    # (label written to the file, label printed, estimator class, kwargs).
    # Estimators are built lazily so a constructor error surfaces only when
    # that classifier's turn comes.
    # NOTE(review): `n_iter` was replaced by `max_iter` in newer scikit-learn
    # releases -- confirm against the installed version.
    sklearn_models = (
        ("Random Forest", "Random Forest", RandomForestClassifier, {"n_estimators": 20}),
        ("Passive Aggressive", "Passive Aggressive Classifier", PassiveAggressiveClassifier, {"loss": 'squared_hinge', "C": 1.0}),
        ("K Nearest Neighbors", "K Nearest Neighbors Classifier", KNeighborsClassifier, {"n_neighbors": 3}),
        ("SGD", "SGD Classifier", SGDClassifier, {"alpha": .0001, "n_iter": 50}),
        ("Perceptron", "Perceptron", Perceptron, {"n_iter": 50}),
        ("Ridge", "Ridge Classifier", RidgeClassifier, {"tol": 1e-2, "solver": "lsqr"}),
        ("Nearest Centroid", "Nearest Centroid", NearestCentroid, {}),
        ("BernoulliNB", "BernoulliNB", BernoulliNB, {"alpha": .01}),
    )

    def evaluate_all():
        """Yield (file label, console label, accuracy) for every classifier."""
        for label, trainer in nltk_trainers:
            classifier = trainer.train(train_set)
            yield label, label, nltk.classify.accuracy(classifier, test_set)

        for file_label, console_label, model_class, params in sklearn_models:
            classifier = model_class(**params).fit(X_train, y_train)
            yield file_label, console_label, classifier.score(X_test, y_test)

    start_time = time.time()

    # The context manager closes the file even if a classifier raises;
    # mode 'w' already truncates it.
    with open("result.txt", 'w') as target:
        for position, (file_label, console_label, accuracy) in enumerate(evaluate_all()):
            if position:
                # Newline between entries only, so the file has no trailing one.
                target.write("\n")
            target.write(file_label)
            target.write("\n")
            target.write(str(accuracy))

            print(console_label, accuracy)

        print("--- training/classification: %s seconds ---" % (time.time() - start_time))

# Run the experiment only when this file is executed directly.
if __name__ == '__main__':
    main()


--- generating record_list: 0.18744802474975586 seconds ---
Decision Tree 1.0
Naive Bayes 0.985
Random Forest 0.67
Passive Aggressive Classifier 0.285
K Nearest Neighbors Classifier 0.55
SGD Classifier 0.64
Perceptron 0.64
Ridge Classifier 0.64
Nearest Centroid 0.55
BernoulliNB 0.37
--- training/classification: 0.2437608242034912 seconds ---
