In [1]:
import pandas as pd
import sklearn
import re
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import classification_report
from nltk.stem.porter import PorterStemmer as porterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords



In [2]:
stopwords_set = set(stopwords.words('english'));

In [3]:
def cleanText(inputText):
    tempText = inputText;
    tempText = re.sub('<[^<]+?>', ' ', tempText);
    tempText = re.sub(r'&amp[;]?', r' ', tempText);
    tempText = re.sub(r'\'s', ' ', tempText);
    tempText = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', tempText);
    tempText = re.sub(r'[<>!#\[\]@/$:.,;%\()*?-]+', r' ', tempText);
    tempText = re.sub(r'\s+', r' ', tempText);
    words_filtered =[word.lower() for word in tempText.split()]
    cleanWordsList = [word for word in words_filtered if word not in stopwords_set]      #remove stopwords except few exceptions  
    cleanfeature = ' '.join(cleanWordsList)
    return cleanfeature

In [4]:
df = pd.read_json('C:/Users/shekh/Desktop/rentallisting/train.json')

In [5]:
df['display_address'] = df['display_address'].astype('category')
df['display_address_codes'] = df['display_address'].cat.codes

df['latitude'] = df['latitude'].astype('category')
df['latitude_codes'] = df['latitude'].cat.codes

df['longitude'] = df['longitude'].astype('category')
df['longitude_codes'] = df['longitude'].cat.codes

df['manager_id'] = df['manager_id'].astype('category')
df['manager_id_codes'] = df['manager_id'].cat.codes

df['interest_level'] = df['interest_level'].astype('category')
df['interest_level_codes'] = df['interest_level'].cat.codes

In [6]:
jointFeaturesDescDF = df[['description','features']];
label_column = df['interest_level_codes']
type(jointFeaturesDescDF)

pandas.core.frame.DataFrame

In [7]:
jointFeaturesDescList = []
for row in jointFeaturesDescDF.iterrows():
    tempList = row[1];
    description = tempList['description'];
    featureStr = ' '.join(tempList['features']);
    joinedStr = ' '.join([description,";",featureStr]);
    joinedStr = cleanText(joinedStr).encode('ascii', 'ignore').strip();
    jointFeaturesDescList.append(joinedStr);

In [8]:
print len(jointFeaturesDescList);

49352


In [9]:
jointFeaturesDescList[1]

'doorman elevator fitness center cats allowed dogs allowed'

In [10]:
label_column_list = label_column.tolist();

In [11]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df=0.00125,
                             max_df = 0.80,
                             sublinear_tf=True,
                             use_idf=True,
                             stop_words=u'english',
                             analyzer='word',
                             ngram_range=(1,3),lowercase=True)

In [12]:
totalVectors = vectorizer.fit_transform(jointFeaturesDescList);
print totalVectors.shape

(49352, 12780)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(totalVectors,label_column_list,test_size=0.25,random_state=15,stratify=label_column_list);

In [14]:
print X_train.shape
print X_test.shape

(37014, 12780)
(12338, 12780)


In [15]:
adaBoostClassifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),n_estimators=600,learning_rate=1)

In [16]:
adaBoostClassifier.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=600, random_state=None)

In [17]:
predicted_labels = adaBoostClassifier.predict(X_test)

In [18]:
target_names = ['0', '1', '2']

In [19]:
actual_class_labels = y_test;
predicted_class_labels = predicted_labels.tolist();

In [20]:
print(classification_report(actual_class_labels, predicted_class_labels, target_names=target_names))

             precision    recall  f1-score   support

          0       0.32      0.17      0.22       960
          1       0.77      0.81      0.79      8571
          2       0.35      0.35      0.35      2807

avg / total       0.64      0.66      0.65     12338



In [21]:
df.columns

Index([            u'bathrooms',              u'bedrooms',
                 u'building_id',               u'created',
                 u'description',       u'display_address',
                    u'features',        u'interest_level',
                    u'latitude',            u'listing_id',
                   u'longitude',            u'manager_id',
                      u'photos',                 u'price',
              u'street_address', u'display_address_codes',
              u'latitude_codes',       u'longitude_codes',
            u'manager_id_codes',  u'interest_level_codes'],
      dtype='object')

In [22]:
featuresDF = df[['bathrooms','bedrooms','price','longitude_codes','latitude_codes','photos','display_address_codes','manager_id_codes']]

In [23]:
featuresDF.head(1)

Unnamed: 0,bathrooms,bedrooms,price,longitude_codes,latitude_codes,photos,display_address_codes,manager_id_codes
10,1.5,3,3000,879,867,[https://photos.renthop.com/2/7211212_1ed4542e...,6544,1239


In [24]:
new_features_df = pd.DataFrame();
type(new_features_df)
new_features_df.shape

(0, 0)

In [25]:
for row in featuresDF.iterrows():
    row[1].set_value('no_of_photos',len((row[1])['photos']));
    new_features_df = new_features_df.append(row[1]);

In [41]:
new_features_df = new_features_df.drop('photos', axis=1)
new_features_df['bathrooms'] = new_features_df['bathrooms'].astype('category')
new_features_df['bedrooms'] = new_features_df['bedrooms'].astype('category')
new_features_df['latitude_codes'] = new_features_df['latitude_codes'].astype('category')
new_features_df['longitude_codes'] = new_features_df['longitude_codes'].astype('category')
new_features_df['manager_id_codes'] = new_features_df['manager_id_codes'].astype('category')
new_features_df['no_of_photos'] = new_features_df['no_of_photos'].astype('category')
new_features_df['price'] = new_features_df['price'].astype('category')

In [42]:
new_features_df.head(1)

Unnamed: 0,bathrooms,bedrooms,display_address_codes,latitude_codes,longitude_codes,manager_id_codes,no_of_photos,price
10,1.5,3.0,6544.0,867.0,879.0,1239.0,5.0,3000.0


In [43]:
label_df = pd.DataFrame(label_column_list);
new_features_train, new_features_test, new_features_y_train, new_features_y_test = train_test_split(new_features_df,label_df,test_size=0.25,random_state=15,stratify=label_column_list)
print len(new_features_train)
print len(new_features_test)
print len(new_features_y_train)
print len(new_features_y_test)
print new_features_train.shape
print new_features_y_train.shape

37014
12338
37014
12338
(37014, 8)
(37014, 1)


In [44]:
print new_features_train.head(1)

      bathrooms bedrooms  display_address_codes latitude_codes  \
99373       1.0      1.0                 5143.0         1401.0   

      longitude_codes manager_id_codes no_of_photos   price  
99373           760.0           3390.0          4.0  2100.0  


In [45]:
adaBoostClassifier_NumericCols = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1)

In [46]:
adaBoostClassifier_NumericCols.fit(new_features_train,new_features_y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=600, random_state=None)

In [47]:
predict_lab = adaBoostClassifier_NumericCols.predict(new_features_test)

In [48]:
new_features_actual_class_labels = new_features_y_test;
new_features_predicted_class_labels = predict_lab.tolist();

In [49]:
print(classification_report(new_features_actual_class_labels, new_features_predicted_class_labels, target_names=target_names))

             precision    recall  f1-score   support

          0       0.44      0.30      0.36       960
          1       0.80      0.83      0.81      8571
          2       0.38      0.38      0.38      2807

avg / total       0.68      0.69      0.68     12338

