In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Path of the input CSV file.
csv_path = 'C:/Users/shekh/Desktop/rentallisting/extrapolated_trained_data_new_1.csv'

# Parse the CSV, then expose the parsed table under the name `df`, which the
# following cells work on.
dataFrame = pd.read_csv(csv_path)
df = pd.DataFrame(dataFrame)

In [3]:
# For every (source column, code column) pair: cast the source column to the
# pandas categorical dtype in place, then store its integer category codes in
# the companion column.
category_columns = (
    ('interest_level', 'interest_level_codes'),
    ('display_address', 'display_address_codes'),
    ('latitude', 'latitude_codes'),
    ('longitude', 'longitude_codes'),
)
for source_column, codes_column in category_columns:
    df[source_column] = df[source_column].astype('category')
    df[codes_column] = df[source_column].cat.codes

In [4]:
# Columns removed from `df` to form the feature frame, dropped in one call.
dropped_columns = [
    'interest_level',
    'building_id',
    'created',
    'description',
    'display_address',
    'latitude',
    'longitude',
    'manager_id',
    'photos',
    'street_address',
    'listing_id',
]
featuresDF = df.drop(dropped_columns, axis=1)

# Display (rows, columns) of the remaining frame.
featuresDF.shape

(45342, 289)

In [5]:
# 2. Feature selection with a variance threshold.
# The cut-off is p * (1 - p) evaluated at p = 0.9; the selector is fitted on
# every column of featuresDF and `temp` receives the transformed result.
p_cut = .9
selector = VarianceThreshold(threshold=p_cut * (1 - p_cut))
temp = selector.fit_transform(featuresDF)

In [6]:
# Show what kind of object selector.fit_transform returned.
type(temp)

numpy.ndarray

In [7]:
# Show the dimensions of the transformed data (rows, retained columns).
temp.shape

(45342L, 22L)

In [8]:
# Ask the fitted selector for its support mask as integer positions
# (indices=True) rather than as a boolean mask.
selected_features = selector.get_support(indices=True)

In [9]:
# Keep every row but only the columns at the positions in selected_features,
# so the result is still a labelled DataFrame.
reducedFeatureDF = featuresDF.iloc[:,selected_features]

In [10]:
# List the names of the columns that were retained.
reducedFeatureDF.columns

Index([u'bathrooms', u'bedrooms', u'no_of_photos', u'price', u'dish_washer',
       u'fitness_center', u'hardwood_floors', u'dining_room', u'elevator',
       u'doorman', u'dog_allowed', u'roof_deck', u'laundry', u'pre_war',
       u'cat_allowed', u'outdoor_space', u'laundry_bldg', u'no_fee',
       u'interest_level_codes', u'display_address_codes', u'latitude_codes',
       u'longitude_codes'],
      dtype='object')

In [11]:
# Pull the encoded target column out of the reduced frame.
# NOTE(review): no executed statement in the visible notebook reads y_label.
y_label = reducedFeatureDF['interest_level_codes']

In [12]:
# Shuffle the rows once with a fixed seed so that the split, and every score
# computed from it, is the same after a kernel restart.
SPLIT_SEED = 15
shuffled = reducedFeatureDF.sample(frac=1, random_state=SPLIT_SEED)

# 80% train / 10% validate / remainder test. The boundaries are derived from
# the frame that is actually being cut, and positional slicing is used so each
# part is an ordinary DataFrame.
# NOTE(review): the shuffle is not stratified, so class proportions may differ
# slightly between the three parts.
n_rows = len(shuffled)
train_end = int(.8 * n_rows)
validate_end = int(.9 * n_rows)
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Every row must land in exactly one part.
assert len(train) + len(validate) + len(test) == n_rows

# Separate the feature columns from the target column in each part.
X_train = train.drop('interest_level_codes', axis=1)
X_validate = validate.drop('interest_level_codes', axis=1)
X_test = test.drop('interest_level_codes', axis=1)

y_train = train['interest_level_codes']
y_validate = validate['interest_level_codes']
y_test = test['interest_level_codes']

# Report the sizes: whole parts first, then feature frames, then targets.
for part in (train, validate, test,
             X_train, X_validate, X_test,
             y_train, y_validate, y_test):
    print(len(part))

36273
4534
4535
36273
4534
4535
36273
4534
4535


In [13]:
# Print the per-class row counts of the target for train, validate and test,
# in that order.
for labels in (y_train, y_validate, y_test):
    print(labels.value_counts())

1    25721
2     8022
0     2530
Name: interest_level_codes, dtype: int64
1    3227
2     972
0     335
Name: interest_level_codes, dtype: int64
1    3268
2     958
0     309
Name: interest_level_codes, dtype: int64


In [14]:
def generate_rf(X_train, y_train, X_validate, y_validate,
                n_estimators=50, min_samples_leaf=10, random_state=None):
    """Fit one random forest and print its score on the validation data.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_validate, y_validate : held-out features and labels passed to ``score``.
    n_estimators : number of trees in the forest (default 50).
    min_samples_leaf : forwarded to the classifier unchanged (default 10).
    random_state : optional seed forwarded to the classifier; leave as None
        for the library's default seeding, or pass an int to make the fitted
        forest reproducible.

    Returns
    -------
    The fitted RandomForestClassifier.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                min_samples_leaf=min_samples_leaf,
                                random_state=random_state)
    rf.fit(X_train, y_train)
    # Label and value joined by one space, written so the line is valid under
    # both the Python 2 print statement and the Python 3 print function.
    print("%s %s" % ("rf score ", rf.score(X_validate, y_validate)))
    return rf

def combine_rfs(rf_a, rf_b):
    """Return a forest that contains the trees of both ``rf_a`` and ``rf_b``.

    Neither argument is modified: the result is a shallow copy of ``rf_a``
    whose ``estimators_`` is a new list (trees of ``rf_a`` followed by trees
    of ``rf_b``) and whose ``n_estimators`` matches that list's length. The
    tree objects themselves are shared, not duplicated.

    Parameters
    ----------
    rf_a, rf_b : fitted forests exposing ``estimators_`` and ``n_estimators``.
        NOTE(review): presumably both were fitted on the same features and
        classes; this function does not check that.

    Returns
    -------
    A new forest object holding ``len(rf_a.estimators_) + len(rf_b.estimators_)``
    trees.
    """
    import copy  # local import keeps this cell self-contained

    # Shallow copy: in-place extension of rf_a.estimators_ would silently turn
    # the caller's first forest into the combined one.
    combined = copy.copy(rf_a)
    combined.estimators_ = list(rf_a.estimators_) + list(rf_b.estimators_)
    combined.n_estimators = len(combined.estimators_)
    return combined

In [15]:
# Train ten forests on the same training data; each call prints its own
# validation score.
rfs = []
for _ in range(10):
    rfs.append(generate_rf(X_train, y_train, X_validate, y_validate))

# Fold the forests together left to right, starting from the first one.
rf_combined = rfs[0]
for next_rf in rfs[1:]:
    rf_combined = combine_rfs(rf_combined, next_rf)

# Label and value joined by one space.
print("%s %s" % ("rf combined score", rf_combined.score(X_validate, y_validate)))

rf score  0.741508601676
rf score  0.736876929863
rf score  0.742170269078
rf score  0.740626378474
rf score  0.737538597265
rf score  0.738641376268
rf score  0.739523599471
rf score  0.736656374063
rf score  0.737759153066
rf score  0.740185266873
rf combined score 0.736435818262


In [16]:
# Display names passed to classification_report: the integer class codes
# 0, 1 and 2 written as strings.
target_names = ['0', '1', '2']

In [17]:
# Predict a class code for every row of the validation features with the
# combined forest.
predicted_labels_validation = rf_combined.predict(X_validate)

In [18]:
# Per-class precision / recall / F1 / support on the validation split.
print(classification_report(y_validate, predicted_labels_validation, target_names=target_names))

             precision    recall  f1-score   support

          0       0.69      0.15      0.24       335
          1       0.76      0.97      0.85      3227
          2       0.48      0.17      0.25       972

avg / total       0.69      0.74      0.68      4534



In [19]:
# Predict a class code for every row of the test features with the combined
# forest.
predicted_labels_test = rf_combined.predict(X_test)

In [20]:
# Per-class precision / recall / F1 / support on the test split.
print(classification_report(y_test, predicted_labels_test, target_names=target_names))

             precision    recall  f1-score   support

          0       0.62      0.10      0.17       309
          1       0.77      0.97      0.86      3268
          2       0.46      0.18      0.26       958

avg / total       0.69      0.74      0.68      4535

