# Importing packages

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Relative path of the kc1_data.txt input file; passed to pd.read_csv in the next cell.
datapath = "../input/kc1_data.txt"

# Assign Column Names

In [3]:
# The file is read with header=None, so all 22 column labels are supplied here.
column_names = [
    'log', 'v(g)', 'ev(g)', 'iv(g)',
    'n', 'v', 'l', 'd', 'i', 'e', 'b',
    't', '10Code', '10Comment', '10Blank',
    '10CodeAndComment', 'uniq_op', 'uniq_Opnd',
    'total_op', 'total_Opnd', 'branchCount',
    'problems',
]
df = pd.read_csv(
    datapath,
    sep=",",
    header=None,
    names=column_names,
    encoding='latin',
)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

# Data Pre-Processing

In [5]:
# List the column labels assigned when the file was read.
df.columns

We have 21 independent variables and 1 target variable (`problems`) in the dataset.

# Print the data types

In [6]:
# Show the dtype pandas inferred for each column.
df.dtypes

# Data Shape

In [7]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Missing Values

In [8]:
# Count missing entries per column (isna is the same method as isnull).
df.isna().sum()

No missing values.

In [9]:
# Feature matrix: every column except the target.
# NOTE: X is taken from df at this point, so the outlier filtering applied to
# df in later cells does not change X.
X = df.drop(columns=["problems"])

In [10]:
# Preview the feature matrix.
X.head()

In [11]:
# Target vector. Select the column as a 1-D Series rather than a one-column
# DataFrame: scikit-learn estimators expect a 1-D y and otherwise emit a
# DataConversionWarning on every fit.
y = df["problems"]

In [12]:
# Preview the target values.
y.head()

# Removing Outliers

In [13]:
# Feature columns to inspect (every column except the target 'problems').
columns = ['log', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't','10Code', '10Comment', '10Blank', '10CodeAndComment', 'uniq_op','uniq_Opnd', 'total_op', 'total_Opnd', 'branchCount']

for column in columns:
    # One figure per feature. The original subplot(1,2,1) left the right half
    # of a 16x5 figure empty, so a single 8x5 axes is used instead.
    fig, ax = plt.subplots(figsize=(8, 5))
    # histplot(kde=True, stat="density") is the replacement for the deprecated
    # sns.distplot: a density-normalised histogram with a KDE overlay.
    sns.histplot(df[column], kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {column}")
    plt.show()


In [14]:
# Three-sigma band for 'log': mean plus/minus three standard deviations.
log_mean = df['log'].mean()
log_three_sigma = 3*df['log'].std()
print("Highest allowed", log_mean + log_three_sigma)
print("Lowest allowed", log_mean - log_three_sigma)

In [15]:
# Rows whose 'log' value lies strictly outside the hard-coded band 5.11 .. 8.80.
above_band = df['log'] > 8.80
below_band = df['log'] < 5.11
df[above_band | below_band]

In [16]:
# Keep the rows inside the 5.11 .. 8.80 band. The comparison is inclusive so
# the kept rows are exactly the complement of the outliers displayed in the
# previous cell (which uses strict > / <); the strict form silently dropped
# rows sitting exactly on a bound.
# .copy() makes the result an independent frame, so the column assignment in
# the next cell does not raise SettingWithCopyWarning.
# NOTE(review): 8.80 / 5.11 are hard-coded, presumably copied from the printed
# three-sigma limits -- confirm they match this dataset.
df = df[(df['log'] <= 8.80) & (df['log'] >= 5.11)].copy()
# Three-sigma limits recomputed on the filtered rows; used for capping below.
upper_limit = df['log'].mean() + 3*df['log'].std()
lower_limit = df['log'].mean() - 3*df['log'].std()

In [17]:
# Cap 'log' to [lower_limit, upper_limit]. Series.clip is the vectorised
# equivalent of the nested np.where, and assign() builds a new frame instead
# of writing into a frame that was produced by boolean filtering (which
# triggers SettingWithCopyWarning).
df = df.assign(log=df['log'].clip(lower=lower_limit, upper=upper_limit))

# Summary statistics of the capped column.
df['log'].describe()

# Split the data

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split_parts

The dataset has been divided into a training part and a test part.

70% of the data is used to train the model and the remaining 30% is used to test it.

# Size of train data

In [20]:
# Shapes of the training features and training target.
x_train.shape  , y_train.shape

# Size of test data

In [21]:
# Shapes of the test features and test target.
x_test.shape , y_test.shape

# Applying SVM Classifier

In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [23]:
# Support-vector classifier constructed with no arguments (library defaults).
svc_model = SVC()

In [24]:
# Train the SVM on the training split.
svc_model.fit(x_train,y_train)

In [25]:
# Predict labels for the held-out test rows.
svc_pred = svc_model.predict(x_test)

In [26]:
# Hold-out accuracy in percent, with arguments in the documented
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
svc_score = 100 * accuracy_score(y_test, svc_pred)

In [27]:
# Display the SVM hold-out accuracy (percent).
svc_score

# Applying Naive Bayes Classifier

In [28]:
from sklearn.naive_bayes import GaussianNB

In [29]:
# Gaussian Naive Bayes classifier constructed with no arguments (library defaults).
naive_bayes_model = GaussianNB()

In [30]:
# Train Naive Bayes on the training split.
naive_bayes_model.fit(x_train,y_train)

In [31]:
# Predict labels for the held-out test rows.
naive_bayes_pred = naive_bayes_model.predict(x_test)

In [32]:
# Hold-out accuracy in percent, with arguments in the documented
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
naive_bayes_score = 100 * accuracy_score(y_test, naive_bayes_pred)

In [33]:
# Display the Naive Bayes hold-out accuracy (percent).
naive_bayes_score

After applying Naive Bayes, our predictions are almost 83% accurate, i.e. the Naive Bayes classifier assigns the correct `problems` label to about 83% of the test rows.

# Applying SVM Classifier with K fold cross validation

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [35]:
# Shuffled 10-fold splitter shared by every cross_val_score call below.
# The first positional argument of sklearn.model_selection.KFold is n_splits,
# so KFold(len(df), ...) requested one fold per row (near leave-one-out):
# very slow, and each fold's accuracy was computed on one or two samples.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [36]:
# Fresh, unfitted SVM for cross-validation.
svc_cv_model = SVC()

In [37]:
# Per-fold accuracy of the SVM, scaled to percent.
svc_cv_score = 100 * cross_val_score(svc_cv_model, X, y, cv=k_fold, scoring='accuracy')

In [38]:
# Display the per-fold SVM accuracies (percent).
svc_cv_score

In [39]:
# Mean SVM accuracy across the folds (percent).
svc_cv_score.mean()

After applying k-fold cross-validation, our predictions are almost 85% accurate, i.e. the SVM classifier assigns the correct label to about 85% of the held-out rows.

# Applying Naive Bayes Classifier with K fold cross validation

In [40]:
# Fresh, unfitted Naive Bayes model for cross-validation.
naive_bayes_cv_model = GaussianNB()

In [41]:
# Per-fold accuracy of Naive Bayes, scaled to percent.
naive_bayes_cv_score = 100 * cross_val_score(naive_bayes_cv_model, X, y, cv=k_fold, scoring='accuracy')

In [42]:
# Display the per-fold Naive Bayes accuracies (percent).
naive_bayes_cv_score

In [43]:
# Mean Naive Bayes accuracy across the folds (percent).
naive_bayes_cv_score.mean()

After applying k-fold cross-validation, our predictions are almost 82% accurate.

In [44]:
# Fit Naive Bayes on the complete X / y (no rows held out).
naive_bayes_cv_model.fit(X,y)

In [45]:
# Predict on the same rows the model was just fitted on.
naive_bayes_cv_pred = naive_bayes_cv_model.predict(X)

In [46]:
# Accuracy (percent) of the predictions made on the fitting data itself.
# NOTE: this rebinds naive_bayes_cv_score, replacing the per-fold array
# computed earlier with a single training-set figure.
naive_bayes_cv_score = 100 * accuracy_score(y, naive_bayes_cv_pred)

In [47]:
# Display the accuracy computed in the previous cell (percent).
naive_bayes_cv_score

# Decision Tree

In [48]:
from sklearn.tree import DecisionTreeClassifier

In [49]:
# Decision tree for cross-validation. The tree builder uses randomness when
# choosing splits, so random_state is pinned (as the train/test split and the
# KFold splitter already are) to make the scores repeatable.
tree_model = DecisionTreeClassifier(random_state=0)

In [50]:
# Per-fold accuracy of the decision tree, scaled to percent.
tree_cv_score = 100 * cross_val_score(tree_model, X, y, cv=k_fold, scoring='accuracy')

In [51]:
# Display the per-fold decision-tree accuracies (percent).
tree_cv_score

In [52]:
# Create the decision tree classifier. random_state is pinned so the fitted
# tree, and every metric derived from it, is the same on each run.
clf = DecisionTreeClassifier(random_state=0)

# Train on the training split (fit returns the estimator itself).
clf = clf.fit(x_train,y_train)

# Predict the response for the held-out test rows.
y_pred = clf.predict(x_test)

In [53]:
# Hold-out accuracy of the decision tree, printed as a fraction (not percent).
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
tree_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", tree_accuracy)

In [54]:
# Label-encode the target into integer class ids.
# The values to encode are the target held in y, not df['log']: 'log' is one
# of the feature columns in X, and df has been row-filtered since X was
# created, so encoding df['log'] produced a wrong target whose length need not
# match X. np.ravel accepts y as a DataFrame, a Series or an already-encoded
# array, so this cell can be re-run safely.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
target = np.ravel(y)
target = le.fit_transform(target)
target

In [55]:
# Rebind y to the encoded array from the previous cell; later
# cross_val_score(..., X, y, ...) calls use this y.
y = target

In [56]:
# The 80:20 re-split below is commented out, so the shapes printed here are
# those of the earlier split made with test_size=0.3.
#x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)
print("Training split input- ", x_train.shape)
print("Testing split input- ", x_test.shape)

In [57]:
# Refit the existing decision tree `clf` on the training split.
# (A `dtree=DecisionTreeClassifier()` statement was fused into the original
# comment line, so no new estimator is created here.)
clf.fit(x_train,y_train)
print('Decision Tree Classifier Created')

In [58]:
# Re-predict the held-out rows and print the per-class report.
from sklearn.metrics import classification_report
y_pred = clf.predict(x_test)
report_text = classification_report(y_test, y_pred)
print("Classification report - \n", report_text)

In [59]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
# Title carries the classifier's accuracy on the test split.
all_sample_title = f'Accuracy Score: {clf.score(x_test, y_test)}'
plt.figure(figsize=(5,5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title(all_sample_title, size = 15)

In [60]:
# Visualising the graph without the use of graphvizplt.figure(figsize = (20,20))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import tree
fig= clf.fit(x_train,y_train)
#dec_tree = tree.plot_tree(decision_tree=clf, feature_names = df.columns, 
                     #class_names =["log", "l", "n"] , filled = True , precision = 4, rounded = True)
tree.plot_tree(fig)
plt.show()

In [None]:
# Mean decision-tree accuracy across the folds (percent).
tree_cv_score.mean()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression constructed with no arguments (library defaults).
logistic_model = LogisticRegression()

In [None]:
# Per-fold accuracy of logistic regression, scaled to percent.
logistic_cv_score = 100 * cross_val_score(logistic_model, X, y, cv=k_fold, scoring='accuracy')

In [None]:
# Display the per-fold logistic-regression accuracies (percent).
logistic_cv_score

In [None]:
# Mean logistic-regression accuracy across the folds (percent).
logistic_cv_score.mean()

 86% accuracy

# Implement KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

Try every value of k from 1 to 25 (`range(1, 26)`).

In [None]:
# Hold-out accuracy (percent) of k-nearest-neighbours for k = 1..25.
k_range = range(1,26)
scores = []
for k in k_range :
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(x_train,y_train)
    pred = KNN.predict(x_test)
    # Arguments in the documented (y_true, y_pred) order.
    scores.append(accuracy_score(y_test, pred)*100)

# Index the table by k: with the default 0-based index, row i actually held
# the score for k = i + 1.
print(pd.DataFrame({"accuracy": scores}, index=pd.Index(k_range, name="k")))

KNN score for train, test split method.

In [None]:
# Line chart of hold-out accuracy against the neighbour count k.
fig_knn, ax_knn = plt.subplots()
ax_knn.plot(k_range, scores)
ax_knn.set_xlabel("K for KNN")
ax_knn.set_ylabel("Testing scores")
plt.show()

Graph showing how the KNN testing score changes with each value of K.

# KNN with k fold Cross Validation.

In [None]:
# Cross-validated accuracy (percent) of k-nearest-neighbours for k = 1..25.
k_range = range(1,26)
scores = []
for k in k_range :
    KNN = KNeighborsClassifier(n_neighbors=k)
    # Per-fold accuracies for this k; after the loop this name holds the
    # array for the last k only.
    KNN_cv_score = cross_val_score(KNN,X,y,cv=k_fold,scoring = 'accuracy')*100
    # list.append returns None, so its result is not bound to a name
    # (the original `cv_score = scores.append(...)` was always None).
    scores.append(KNN_cv_score)

# One row per k, one column per fold; rows are labelled with their k.
print(pd.DataFrame(scores, index=pd.Index(k_range, name="k")))

KNN cross validation mean score

In [None]:
# NOTE: KNN_cv_score holds only the last loop iteration (k = 25), so this is
# the mean fold accuracy for k = 25, not an average over all k.
KNN_cv_score.mean()

With cross-validation, the KNN score is 85%.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. Bootstrapping and feature sampling are random,
# so random_state is pinned to make the reported accuracy repeatable.
# NOTE: this rebinds `clf`, which previously held the decision tree.
clf = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Train the forest on the training split.
clf.fit(x_train, y_train)

# Predict labels for the held-out test rows.
y_pred = clf.predict(x_test)

In [None]:
# Hold-out accuracy of the random forest, printed as a fraction (not percent).
from sklearn import metrics
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of model using Random Forest.:", rf_accuracy)
