# Importing packages

In [36]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [37]:
datapath = "../input/kc1_data.txt"

# Assign Column Names

In [38]:
# Load the comma-separated file at `datapath`. With header=None the first line
# is read as data and the column labels come from `names`: 21 metric columns
# followed by the `problems` target.
df = pd.read_csv(
    datapath,
    sep=",",
    header=None,
    names=[
        'log', 'v(g)', 'ev(g)', 'iv(g)',
        'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
        '10Code', '10Comment', '10Blank', '10CodeAndComment',
        'uniq_op', 'uniq_Opnd', 'total_op', 'total_Opnd',
        'branchCount',
        'problems',
    ],
    encoding='latin',
)

In [39]:
df.head()

# Data Pre-Processing

In [40]:
df.columns

The dataset has 21 independent variables and 1 target variable, `problems`.

# Print the data types

In [41]:
df.dtypes

# Data Shape

In [42]:
df.shape

## Missing Values

In [43]:
df.isnull().sum()

There are no missing values.

In [44]:
X =df.drop(["problems"],axis=1)

In [45]:
X.head()

In [46]:
y = df[["problems"]]

In [47]:
y.head()

# Removing Outliers

In [48]:
# Metric columns to inspect: every column except the `problems` target.
columns = ['log', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't','10Code', '10Comment', '10Blank', '10CodeAndComment', 'uniq_op','uniq_Opnd', 'total_op', 'total_Opnd', 'branchCount']

# One histogram with a KDE overlay per metric, to eyeball skew and extreme values.
# `sns.histplot(..., kde=True, stat="density")` replaces the deprecated
# `sns.distplot`; a single 8x5 axes replaces the half-empty 16x5 two-slot figure.
for column in columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[column], kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {column}")
    plt.show()
    # Release the figure so 21 plots do not accumulate in memory.
    plt.close(fig)


In [49]:
# Three-sigma band for `log`: mean plus/minus three standard deviations.
log_mean = df['log'].mean()
log_std = df['log'].std()
print("Highest allowed", log_mean + 3*log_std)
print("Lowest allowed", log_mean - 3*log_std)

In [50]:
# Show the rows whose `log` value lies outside mean ± 3·std. The limits are
# computed from the data instead of being typed in by hand, so they always
# match the dataset that is actually loaded.
log_upper = df['log'].mean() + 3*df['log'].std()
log_lower = df['log'].mean() - 3*df['log'].std()
df[(df['log'] > log_upper) | (df['log'] < log_lower)]

In [51]:
# Trim rows whose `log` value falls outside mean ± 3·std. The band is computed
# from the data rather than hard-coded, so it always matches this dataset.
trim_upper = df['log'].mean() + 3*df['log'].std()
trim_lower = df['log'].mean() - 3*df['log'].std()
# .copy() makes the filtered frame independent of the original, so assigning to
# its columns afterwards does not raise SettingWithCopyWarning.
df = df[(df['log'] < trim_upper) & (df['log'] > trim_lower)].copy()

# Three-sigma limits recomputed on the trimmed data.
upper_limit = df['log'].mean() + 3*df['log'].std()
lower_limit = df['log'].mean() - 3*df['log'].std()
# NOTE(review): X and y were extracted from df before this filter, so they still
# contain every row; the trimming does not reach the models unless X and y are
# rebuilt from the cleaned df.

In [52]:
# Cap `log` into [lower_limit, upper_limit] in two passes: first lift values
# below the lower limit, then pull down values above the upper limit.
log_values = df['log']
capped = np.where(log_values < lower_limit, lower_limit, log_values)
capped = np.where(log_values > upper_limit, upper_limit, capped)
df['log'] = capped

df['log'].describe()

# Split the data

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

The dataset has been divided into a training part and a test part.

70% of the data will be used to train the model and the remaining 30% will be used to test it.

# Size of train data

In [55]:
x_train.shape  , y_train.shape

# Size of test data

In [56]:
x_test.shape , y_test.shape

# Applying SVM Classifier

In [57]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [58]:
svc_model = SVC()

In [59]:
# Fit the default SVC on the training split. np.ravel flattens the single-column
# target to the 1-D shape scikit-learn expects, which avoids the
# DataConversionWarning raised for column-vector targets.
svc_model.fit(x_train, np.ravel(y_train))

In [60]:
svc_pred = svc_model.predict(x_test)


In [61]:

# Test-set accuracy of the default SVC as a percentage. Arguments are given in
# the documented (y_true, y_pred) order; accuracy is symmetric, so the value is
# the same either way.
svc_score = 100 * accuracy_score(y_test, svc_pred)

In [62]:
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as integers. np.ravel passes LabelEncoder a 1-D
# array (a single-column DataFrame triggers a DataConversionWarning) and also
# keeps this cell re-runnable once y_train has already become an ndarray.
le = LabelEncoder()
y_train = le.fit_transform(np.ravel(y_train))
# NOTE(review): y_test is not encoded here, so models fit on this y_train
# predict encoded labels while y_test keeps the raw ones; use
# le.transform(np.ravel(y_test)) when scoring if the raw labels are not 0/1.

In [63]:
from sklearn.svm import SVC

# RBF-kernel SVM trained on the training split. `fit` returns the estimator
# itself, so construction and fitting are chained and the fitted model is shown
# as the cell output.
classifier = SVC(kernel='rbf', random_state=1).fit(x_train, y_train)
classifier

In [64]:
# Get support vector indices: `support_` on the fitted SVC lists which training
# samples were selected as support vectors.
support_vector_indices = classifier.support_
print(support_vector_indices)

In [65]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
# Score the RBF `classifier` that was just trained, not the earlier `svc_pred`
# produced by `svc_model`. `classifier` was fit on label-encoded targets, so
# y_test is passed through the same encoder before the comparison.
classifier_pred = classifier.predict(x_test)
print("Accuracy:",metrics.accuracy_score(le.transform(np.ravel(y_test)), classifier_pred))

# Applying Naive Bayes Classifier

In [66]:
from sklearn.naive_bayes import GaussianNB

In [67]:
naive_bayes_model = GaussianNB()

In [68]:
naive_bayes_model.fit(x_train,y_train)

In [69]:
naive_bayes_pred = naive_bayes_model.predict(x_test)

In [None]:
# Test-set accuracy of Naive Bayes as a percentage. The model was fit on
# label-encoded targets, so y_test goes through the same encoder `le` first;
# otherwise encoded predictions would be compared against raw labels.
naive_bayes_score = accuracy_score(le.transform(np.ravel(y_test)), naive_bayes_pred)*100

In [None]:
naive_bayes_score

After applying Naive Bayes, our predictions are almost 83% accurate, i.e. we have identified 83% of the problems correctly with our Naive Bayes classifier model.

# Applying SVM Classifier with K fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# The first argument of model_selection.KFold is n_splits (the number of folds),
# not the number of samples. Passing len(df) created roughly one fold per row,
# i.e. near leave-one-out: very slow, and every fold score is either 0 or 100.
# Ten shuffled folds is the conventional choice.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
svc_cv_model = SVC()

In [None]:
# Per-fold accuracy (in percent) of the SVC. np.ravel turns the single-column
# target into the 1-D array scikit-learn expects, so each fold's fit does not
# emit a DataConversionWarning.
svc_cv_score = cross_val_score(svc_cv_model, X, np.ravel(y), cv=k_fold, scoring='accuracy')*100

In [None]:
svc_cv_score

In [None]:
svc_cv_score.mean()

After applying k-fold cross-validation, our predictions are almost 85% accurate, i.e. we have identified 85% of the problems correctly with our SVM classifier model.

# Applying Naive Bayes Classifier with K fold cross validation

In [None]:
naive_bayes_cv_model = GaussianNB()

In [None]:
# Per-fold accuracy (in percent) of Gaussian Naive Bayes. np.ravel turns the
# single-column target into the 1-D array scikit-learn expects, so each fold's
# fit does not emit a DataConversionWarning.
naive_bayes_cv_score = cross_val_score(naive_bayes_cv_model, X, np.ravel(y), cv=k_fold, scoring='accuracy')*100

In [None]:
naive_bayes_cv_score

In [None]:
naive_bayes_cv_score.mean()

After applying k-fold cross-validation, our predictions are almost 82% accurate.

In [None]:
# Fit Naive Bayes on the full dataset. np.ravel flattens the single-column
# target to 1-D, avoiding scikit-learn's column-vector DataConversionWarning.
naive_bayes_cv_model.fit(X, np.ravel(y))

In [None]:
naive_bayes_cv_pred = naive_bayes_cv_model.predict(X)

In [None]:
# Accuracy (in percent) on the same rows the model was fit on, so this is a
# training-set score rather than a cross-validated one. Note that it replaces
# the per-fold array previously stored under this name.
naive_bayes_cv_score = 100 * accuracy_score(y, naive_bayes_cv_pred)

In [None]:
naive_bayes_cv_score

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree_model = DecisionTreeClassifier()

In [None]:
# Per-fold accuracy (in percent) of the decision tree. np.ravel turns the
# single-column target into the 1-D array scikit-learn expects.
tree_cv_score = cross_val_score(tree_model, X, np.ravel(y), cv=k_fold, scoring='accuracy')*100

In [None]:
tree_cv_score

In [None]:
tree_cv_score.mean()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic_model = LogisticRegression()

In [None]:
# Per-fold accuracy (in percent) of logistic regression. np.ravel turns the
# single-column target into the 1-D array scikit-learn expects, so each fold's
# fit does not emit a DataConversionWarning.
logistic_cv_score = cross_val_score(logistic_model, X, np.ravel(y), cv=k_fold, scoring='accuracy')*100

In [None]:
logistic_cv_score

In [None]:
logistic_cv_score.mean()

 86% accuracy

# Implement KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

Set k to each value from 1 to 25 (`range(1, 26)` stops before 26).

In [None]:
# Evaluate KNN on the hold-out split for k = 1 .. 25.
k_range = range(1,26)
scores = []
# The models are fit on label-encoded y_train, so encode y_test once with the
# same encoder instead of comparing encoded predictions against raw labels.
y_test_encoded = le.transform(np.ravel(y_test))
for k in k_range :
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(x_train,y_train)
    pred = KNN.predict(x_test)
    scores.append(accuracy_score(y_test_encoded, pred)*100)

# Index the table by k so each accuracy can be read against its neighbour count.
print(pd.DataFrame({"accuracy": scores}, index=pd.Index(k_range, name="k")))

KNN score for train, test split method.

In [None]:
# Line plot of hold-out accuracy against the number of neighbours.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set_xlabel("K for KNN")
ax.set_ylabel("Testing scores")
plt.show()

Graph of how the testing score changes with each value of K for KNN.

# KNN with k fold Cross Validation.

In [None]:
# Cross-validated KNN accuracy (in percent) for k = 1 .. 25.
k_range = range(1,26)
scores = []
# Flatten the single-column target once; scikit-learn expects a 1-D y and would
# otherwise warn on every fold of every k.
y_flat = np.ravel(y)
for k in k_range :
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN_cv_score = cross_val_score(KNN,X,y_flat,cv=k_fold,scoring = 'accuracy')*100
    # list.append returns None, so its result is not stored.
    scores.append(KNN_cv_score)

# One row per k, one column per fold.
print(pd.DataFrame(scores, index=pd.Index(k_range, name="k")))

KNN cross validation mean score

In [None]:
# Mean cross-validated accuracy for every k. `KNN_cv_score` alone only holds the
# folds of the last k tried in the loop, so the mean is taken over `scores`,
# which keeps one array of fold scores per k.
pd.Series(np.mean(scores, axis=1), index=pd.Index(k_range, name="k"), name="mean_cv_accuracy")

With cross-validation, the KNN score is 85%.