In [1]:
import pandas as pd
import sklearn
import re
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import classification_report
from nltk.stem.porter import PorterStemmer as porterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from nltk.corpus import stopwords



In [2]:
# English stop words from NLTK, stored as a set; cleanText filters tokens
# against this collection.
stopwords_set = set(stopwords.words('english'))

In [3]:
def cleanText(inputText, stop_words=None):
    """Normalise raw listing text into a lower-cased, space-separated token string.

    Processing steps, in order:
      1. replace HTML-style tags with a space,
      2. replace ``&amp`` / ``&amp;`` with a space,
      3. replace the suffix ``'s`` with a space (case-sensitive),
      4. replace e-mail-like tokens with a space,
      5. replace runs of punctuation characters with a space,
      6. split on whitespace, lower-case every token and drop stop words.

    Parameters
    ----------
    inputText : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the notebook-level
        ``stopwords_set`` is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    if stop_words is None:
        stop_words = stopwords_set

    text = re.sub('<[^<]+?>', ' ', inputText)
    text = re.sub(r'&amp[;]?', r' ', text)
    text = re.sub(r'\'s', ' ', text)
    text = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', text)
    text = re.sub(r'[<>!#\[\]@/$:.,;%\()*?-]+', r' ', text)

    # str.split() with no argument splits on any run of whitespace, so no
    # separate whitespace-collapsing pass is needed before tokenising.
    tokens = [token.lower() for token in text.split()]

    # Every stop word is removed; there is no allow-list of exceptions.
    return ' '.join(token for token in tokens if token not in stop_words)

In [4]:
# Path of the training listings JSON file read into the main DataFrame.
TRAIN_JSON_PATH = 'C:/Users/shekh/Desktop/rentallisting/train.json'
df = pd.read_json(TRAIN_JSON_PATH)

In [5]:
# Convert each listed column to pandas' categorical dtype in place and add a
# companion '<column>_codes' column holding its integer category codes.
for category_column in ['display_address', 'latitude', 'longitude',
                        'manager_id', 'interest_level']:
    df[category_column] = df[category_column].astype('category')
    df[category_column + '_codes'] = df[category_column].cat.codes

In [6]:
# Target: the integer codes of interest_level.
label_column = df['interest_level_codes']
# Text inputs: the 'description' and 'features' columns.
jointFeaturesDescDF = df[['description', 'features']]
type(jointFeaturesDescDF)

pandas.core.frame.DataFrame

In [7]:
# Build one cleaned text document per listing: the description and the
# space-joined features, separated by ';', run through cleanText, then
# encoded to ASCII (non-ASCII characters dropped) and stripped.
jointFeaturesDescList = []
for listingDescription, listingFeatures in zip(jointFeaturesDescDF['description'],
                                               jointFeaturesDescDF['features']):
    rawDocument = ' '.join([listingDescription, ";", ' '.join(listingFeatures)])
    cleanedDocument = cleanText(rawDocument).encode('ascii', 'ignore').strip()
    jointFeaturesDescList.append(cleanedDocument)

In [8]:
# Number of cleaned documents that were built.
print(len(jointFeaturesDescList))

49352


In [9]:
# Show the cleaned document at position 1 as a spot check of the cleaning step.
jointFeaturesDescList[1]

'doorman elevator fitness center cats allowed dogs allowed'

In [10]:
# Target codes as a plain Python list; reused as both the labels and the
# stratification key when splitting.
label_column_list = label_column.tolist()

In [11]:
# Create feature vectors
# Word-level TF-IDF over unigrams, bigrams and trigrams with sublinear term
# frequency; terms are filtered by the min_df / max_df document-frequency
# bounds and the built-in English stop-word list.
tfidf_settings = dict(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    stop_words=u'english',
    min_df=0.00125,
    max_df=0.80,
    use_idf=True,
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

In [12]:
# Fit the vectorizer on every cleaned document and transform them in one go.
# NOTE(review): this fit happens before the train/test split below, so the
# learned vocabulary and IDF weights also reflect the held-out documents.
totalVectors = vectorizer.fit_transform(jointFeaturesDescList)
print(totalVectors.shape)

(49352, 12780)


In [None]:
# Hold out 25% of the TF-IDF rows, stratified on the label codes, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    totalVectors,
    label_column_list,
    test_size=0.25,
    random_state=15,
    stratify=label_column_list
)

In [None]:
# Shapes of the training and held-out TF-IDF matrices.
print(X_train.shape)
print(X_test.shape)

In [None]:
# AdaBoost ensemble of 600 depth-3 decision trees. random_state is pinned
# (same seed as the train/test split) so repeated runs fit the same model.
adaBoostClassifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=600,
    learning_rate=1,
    random_state=15
)

In [None]:
# Train the boosted ensemble on the TF-IDF training split; as the last
# expression of the cell, the value returned by fit() is displayed.
adaBoostClassifier.fit(X_train,y_train)

In [None]:
# Predict a class code for every row of the held-out TF-IDF matrix.
predicted_labels = adaBoostClassifier.predict(X_test)

In [None]:
# Display names for the three class codes used in the classification
# reports; the labels are the integer category codes of interest_level.
target_names = [str(class_code) for class_code in range(3)]

In [None]:
# Predictions converted to a plain list, alongside the held-out true labels.
predicted_class_labels = predicted_labels.tolist()
actual_class_labels = y_test

In [None]:
# Per-class precision / recall / F1 for the text-based AdaBoost model.
adaBoostReport = classification_report(
    actual_class_labels,
    predicted_class_labels,
    target_names=target_names
)
print(adaBoostReport)

In [None]:
# List the available columns (including the *_codes columns added earlier)
# before choosing the structured features.
df.columns

In [None]:
# Structured (non-text) inputs for the second model; 'photos' is kept only so
# a photo count can be derived from it afterwards.
structured_feature_columns = [
    'bathrooms',
    'bedrooms',
    'price',
    'longitude_codes',
    'latitude_codes',
    'photos',
    'display_address_codes',
    'manager_id_codes',
]
featuresDF = df[structured_feature_columns]

In [None]:
# Preview the first row of the selected structured features.
featuresDF.head(1)

In [None]:
# Start from an empty frame that will receive the engineered features;
# the cell displays its (empty) shape.
new_features_df = pd.DataFrame()
new_features_df.shape

In [None]:
# Copy the structured features and add 'no_of_photos', the length of each
# listing's 'photos' value. This is done column-wise in one step: appending
# rows one at a time re-copies the growing frame on every iteration, and
# Series.set_value / DataFrame.append are deprecated pandas APIs.
# featuresDF itself is left unmodified.
new_features_df = featuresDF.copy()
new_features_df['no_of_photos'] = featuresDF['photos'].apply(len)

In [None]:
# The raw 'photos' column is no longer needed once the count exists.
new_features_df = new_features_df.drop('photos', axis=1)

# Cast the listed columns to the categorical dtype
# ('display_address_codes' is intentionally not in this list, as before).
for categorical_feature in ['bathrooms', 'bedrooms', 'latitude_codes',
                            'longitude_codes', 'manager_id_codes',
                            'no_of_photos', 'price']:
    new_features_df[categorical_feature] = new_features_df[categorical_feature].astype('category')

In [None]:
# Preview the first row of the engineered feature frame.
new_features_df.head(1)

In [None]:
# Labels as a single-column DataFrame, split together with the structured
# features using the same proportions, seed and stratification as the text
# features.
label_df = pd.DataFrame(label_column_list)
(new_features_train, new_features_test,
 new_features_y_train, new_features_y_test) = train_test_split(
    new_features_df,
    label_df,
    test_size=0.25,
    random_state=15,
    stratify=label_column_list
)

# Sanity-check the sizes of the four pieces.
print(len(new_features_train))
print(len(new_features_test))
print(len(new_features_y_train))
print(len(new_features_y_test))
print(new_features_train.shape)
print(new_features_y_train.shape)

In [None]:
# First row of the structured training features.
print(new_features_train.head(1))

In [None]:
# Support vector classifier with an RBF kernel; every other hyper-parameter
# is left at the library default. Only this one estimator is constructed:
# a second SVC(...) instance that was built and immediately discarded has
# been removed.
svmRBFKernelClassifier = SVC(kernel='rbf')

In [None]:
# new_features_y_train is a single-column DataFrame, so flatten it to a 1-D
# label array before fitting instead of passing a column vector as y.
svmRBFKernelClassifier.fit(new_features_train, new_features_y_train.values.ravel())

In [None]:
# Predict a class code for every held-out row of structured features.
predict_lab = svmRBFKernelClassifier.predict(new_features_test)

In [None]:
# Predictions converted to a plain list, alongside the held-out true labels.
new_features_predicted_class_labels = predict_lab.tolist()
new_features_actual_class_labels = new_features_y_test

In [None]:
# Per-class precision / recall / F1 for the structured-feature SVM.
svmReport = classification_report(
    new_features_actual_class_labels,
    new_features_predicted_class_labels,
    target_names=target_names
)
print(svmReport)