In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report



In [2]:
# Location of the training CSV. The default is the path this notebook was
# originally run with; set the RENTAL_LISTING_CSV environment variable to
# load the file from another location without editing the notebook.
DATA_PATH = os.environ.get(
    'RENTAL_LISTING_CSV',
    'C:/Users/shekh/Desktop/rentallisting/extrapolated_trained_data_new_1.csv')

# `dataFrame` is the frame as read from disk; `df` is the working frame that
# the following cells add encoded columns to.
dataFrame = pd.read_csv(DATA_PATH)
df = pd.DataFrame(dataFrame)

In [3]:
# Convert each listed column to pandas' category dtype and store its integer
# category code in a sibling "<column>_codes" column. The original columns are
# overwritten in place with their categorical version.
# NOTE(review): latitude and longitude are encoded as categories as well, so
# their *_codes columns hold category positions rather than coordinates --
# confirm this is intended.
for column in ('interest_level', 'display_address', 'latitude', 'longitude'):
    df[column] = df[column].astype('category')
    df[column + '_codes'] = df[column].cat.codes

In [4]:
# Columns removed before feature selection: the un-encoded originals of the
# columns that were given *_codes counterparts, plus identifier, timestamp,
# text and photo columns.
# 'interest_level_codes' is deliberately NOT dropped here; later cells read the
# label from the reduced frame.
NON_FEATURE_COLUMNS = [
    'interest_level',
    'building_id',
    'created',
    'description',
    'display_address',
    'latitude',
    'longitude',
    'manager_id',
    'photos',
    'street_address',
    'listing_id',
]
featuresDF = df.drop(NON_FEATURE_COLUMNS, axis=1)

featuresDF.shape

(45342, 289)

In [5]:
# featuresDF.iloc[:,[0,1,2]]

In [6]:
# #1. Using RFE model and select 30 features
# rfe = RFE(lr_classifier, 30)
# rfe = rfe.fit(featuresDF, y_label)
# # summarize the selection of the attributes
# print(rfe.support_)
# print(rfe.ranking_)

In [7]:
# 2. Variance-threshold feature selection.
# The cut-off is p * (1 - p) with p = 0.9, i.e. the variance of a 0/1 column
# in which 90% of the rows share the same value; columns with lower variance
# are removed.
# The selector is fitted on all of featuresDF, which at this point still
# includes the 'interest_level_codes' label column.
DOMINANT_SHARE = .9
selector = VarianceThreshold(threshold=(DOMINANT_SHARE * (1 - DOMINANT_SHARE)))
temp = selector.fit_transform(featuresDF)

In [8]:
# Check what kind of object fit_transform returned for the reduced data.
type(temp)

numpy.ndarray

In [9]:
# Rows x columns that remain after the variance filter.
temp.shape

(45342L, 22L)

In [10]:
# Integer column positions (rather than a boolean mask) of the features the
# fitted selector kept.
selected_features = selector.get_support(indices=True)

In [11]:
# Pick the kept columns out of featuresDF by position so the reduced data
# remains a DataFrame with its column names.
reducedFeatureDF = featuresDF.iloc[:,selected_features]

In [12]:
# Show the names of the columns that survived feature selection.
reducedFeatureDF.columns

Index([u'bathrooms', u'bedrooms', u'no_of_photos', u'price', u'dish_washer',
       u'fitness_center', u'hardwood_floors', u'dining_room', u'elevator',
       u'doorman', u'dog_allowed', u'roof_deck', u'laundry', u'pre_war',
       u'cat_allowed', u'outdoor_space', u'laundry_bldg', u'no_fee',
       u'interest_level_codes', u'display_address_codes', u'latitude_codes',
       u'longitude_codes'],
      dtype='object')

In [13]:
# Label vector for the full data set: the integer codes of interest_level.
# The same column is still present in reducedFeatureDF at this point.
y_label = reducedFeatureDF['interest_level_codes']

In [14]:
# Shuffle the reduced frame once, then cut it 80% / 10% / 10% into
# train / validation / test.
#
# The shuffle is seeded so the splits (and every metric computed from them)
# are the same on each run; 15 is the seed the earlier, now-removed
# train_test_split variant of this cell used. The cut points are derived from
# the frame actually being split, and plain positional slicing is used instead
# of np.split on a DataFrame.
# NOTE(review): the split is not stratified by label.
SPLIT_SEED = 15
LABEL_COLUMN = 'interest_level_codes'

shuffled = reducedFeatureDF.sample(frac=1, random_state=SPLIT_SEED)
n_rows = len(shuffled)
train_end = int(.8 * n_rows)
validate_end = int(.9 * n_rows)

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Every row must land in exactly one split.
assert len(train) + len(validate) + len(test) == n_rows

print(len(train))
print(len(validate))
print(len(test))

# Features: everything except the label column.
X_train = train.drop(LABEL_COLUMN, axis=1)
X_validate = validate.drop(LABEL_COLUMN, axis=1)
X_test = test.drop(LABEL_COLUMN, axis=1)

# Labels: the label column on its own.
y_train = train[LABEL_COLUMN]
y_validate = validate[LABEL_COLUMN]
y_test = test[LABEL_COLUMN]

# Sanity check: feature and label parts have the same lengths as their splits.
# print(...) with a single argument behaves the same on Python 2 and 3.
for part in (X_train, X_validate, X_test, y_train, y_validate, y_test):
    print(len(part))

36273
4534
4535
36273
4534
4535
36273
4534
4535


In [15]:
# Class distribution of the label in each split (train, validation, test).
# Uses the print(...) call form so the cell runs on Python 3 as well as
# Python 2, matching the classification-report cells.
for split_labels in (y_train, y_validate, y_test):
    print(split_labels.value_counts())

1    25769
2     7992
0     2512
Name: interest_level_codes, dtype: int64
1    3201
2     998
0     335
Name: interest_level_codes, dtype: int64
1    3246
2     962
0     327
Name: interest_level_codes, dtype: int64


In [16]:
# Fit a logistic-regression classifier on the training split.
# C is the inverse regularisation strength, so 1e5 means the penalty is very
# weak.
# NOTE(review): the features are passed in unscaled and no class weighting is
# set -- worth revisiting if minority-class recall matters.
INVERSE_REG_STRENGTH = 1e5
lr_classifier = LogisticRegression(C=INVERSE_REG_STRENGTH)
lr_classifier.fit(X=X_train, y=y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Row labels for the classification reports: the three integer label codes,
# as strings.
target_names = ['0', '1', '2']

# Predicted label codes for the validation split.
predict_labels_validation = lr_classifier.predict(X_validate)

In [18]:
# Per-class precision / recall / F1 on the validation split.
validation_report = classification_report(
    y_validate,
    predict_labels_validation,
    target_names=target_names,
)
print(validation_report)

             precision    recall  f1-score   support

          0       0.50      0.00      0.01       335
          1       0.73      0.98      0.83      3201
          2       0.42      0.10      0.16       998

avg / total       0.64      0.71      0.62      4534



In [19]:
# Predicted label codes for the test split, from the same fitted classifier.
predict_labels_test = lr_classifier.predict(X_test)

In [20]:
# Per-class precision / recall / F1 on the test split.
test_report = classification_report(
    y_test,
    predict_labels_test,
    target_names=target_names,
)
print(test_report)

             precision    recall  f1-score   support

          0       1.00      0.00      0.01       327
          1       0.73      0.97      0.84      3246
          2       0.40      0.09      0.15       962

avg / total       0.68      0.72      0.63      4535

