In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [3]:
df = pd.read_csv("data_semantics_training.csv")

In [4]:
df.drop(["serial_no"],axis=1,inplace=True)

In [5]:
categorical = df.select_dtypes(include=['object'])
cat_values=categorical.columns.values
for i in range(len(cat_values)):
    print('For the column "',cat_values[i],'" the unique values are')
    print(df[cat_values[i]].unique())

For the column " C7 " the unique values are
['Healthy' 'Defaulted' 'None' 'Unhealthy']
For the column " C8 " the unique values are
['Live' 'Closed' nan]


In [6]:
df=df.fillna(method='ffill')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 8 columns):
C1    699 non-null int64
C2    699 non-null int64
C3    699 non-null int64
C4    699 non-null int64
C5    699 non-null float64
C6    699 non-null float64
C7    699 non-null object
C8    699 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 43.8+ KB


In [8]:
y=df['C7']
y=pd.DataFrame(y)
X=df.drop(['C7'],axis=1)

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 7 columns):
C1    699 non-null int64
C2    699 non-null int64
C3    699 non-null int64
C4    699 non-null int64
C5    699 non-null float64
C6    699 non-null float64
C8    699 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 38.3+ KB


In [10]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 1 columns):
C7    699 non-null object
dtypes: object(1)
memory usage: 5.5+ KB


In [11]:
X=pd.get_dummies(X,columns = ['C8'])
y['C7'] = y['C7'].map({'Healthy':0, 'Unhealthy':1,'None':2,'Unhealthy':3})


In [12]:
X.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C8_Closed,C8_Live
0,1,1,15200,80,6.5,24.0,0,1
1,2,2,26100,90,6.75,30.0,1,0
2,3,3,22400,80,7.5,36.0,1,0
3,4,4,21600,90,7.5,42.0,1,0
4,5,5,44000,100,10.0,60.0,0,1


In [13]:
y=y.fillna(method='ffill')

In [14]:
y.describe()

Unnamed: 0,C7
count,699.0
mean,0.273247
std,0.761549
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,3.0


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=123)

In [17]:
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

In [18]:
model = alg.fit(X,y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
pred1 = model.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
print("Accuracy for model 1: %.2f" % (accuracy_score(y_test, pred1) * 100))

Accuracy for model 1: 98.57


In [40]:
model3 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=250,
 max_depth=5,
 min_child_weight=1,
 gamma=0.1,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [41]:
train_model3 = model3.fit(X, y)
pred3 = train_model3.predict(X_test)
print("Accuracy for model 3: %.2f" % (accuracy_score(y_test, pred3) * 100))

Accuracy for model 3: 100.00


In [36]:
from sklearn.model_selection import GridSearchCV

param_test = {
 'n_estimators':np.arange(250,2500,250)
}
gsearch = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softmax', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test,n_jobs=4,iid=False, cv=5)

train_model4 = gsearch.fit(X_train, y_train)
pred4 = train_model4.predict(X_test)
print("Accuracy for model 4: %.2f" % (accuracy_score(y_test, pred4) * 100))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy for model 4: 97.14


In [37]:
gsearch.best_params_

{'n_estimators': 250}

Predicting output

In [23]:
df2 = pd.read_csv("data_sematics_test.csv")

In [24]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 8 columns):
serial_no    301 non-null int64
C1           301 non-null int64
C2           301 non-null int64
C3           301 non-null int64
C4           301 non-null int64
C5           294 non-null float64
C6           294 non-null float64
C8           248 non-null object
dtypes: float64(2), int64(5), object(1)
memory usage: 18.9+ KB


In [25]:
df2.drop(['serial_no'],axis=1,inplace=True)

In [31]:
df2=df2.fillna(method = 'ffill')

In [32]:
df2=pd.get_dummies(df2,columns = ['C8'])

KeyError: "['C8'] not in index"

In [54]:
preds = clf.predict(df2)

In [55]:
_ = pd.read_csv("data_sematics_test.csv")
serial = _['serial_no']
data = { 'serial_no': serial, 'C7': preds}
submission = pd.DataFrame(data)
submission.head()

Unnamed: 0,serial_no,C7
0,1,0.0
1,2,2.0
2,3,0.0
3,4,0.0
4,5,0.0


In [56]:
submission['C7'] = submission['C7'].map({0:'Healthy', 1:'Unhealthy',2:'None',3:'Unhealthy'})


In [57]:
submission.to_csv("dectres.csv",index=False)

Linear SVC

In [1]:
from sklearn.svm import LinearSVC

In [21]:
clf = LinearSVC(random_state=0, tol=1e-5)

In [22]:
clf.fit(X,y)

  y = column_or_1d(y, warn=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X, y)

  


In [44]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier


In [45]:
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)

In [46]:
clf.fit(X,y)

  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [52]:
clf = DecisionTreeClassifier(max_depth=None,min_samples_split=2, random_state=0)

In [53]:
clf.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')