# The project concerns basic Pandas, Numpy, Matplotlib, Natural Language Processing (NLP)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Data Importing and Processing

In [129]:
dog_data = pd.read_table("dog_data.txt") #importing data 
dog_data.head() #shows us the first 5 rows of our data

Unnamed: 0,sequence,class
0,ATGCCACAGCTAGATACATCCACCTGATTTATTATAATCTTTTCAA...,4
1,ATGAACGAAAATCTATTCGCTTCTTTCGCTGCCCCCTCAATAATAG...,4
2,ATGGAAACACCCTTCTACGGCGATGAGGCGCTGAGCGGCCTGGGCG...,6
3,ATGTGCACTAAAATGGAACAGCCCTTCTACCACGACGACTCATACG...,6
4,ATGAGCCGGCAGCTAAACAGAAGCCAGAACTGCTCCTTCAGTGACG...,0


In [131]:
dog_data['class'].value_counts() #shows us how many entries there are in each "class"

6    260
4    135
0    131
3     95
1     75
2     64
5     60
Name: class, dtype: int64

In [133]:
human_data.shape #shows of how many rows and columns our data has

(4380, 2)

In [134]:
# slow and inefficient way of obtainig k-mers
#i = 0
#k = 6
#j = 0
#words = []
#for j in range(4380):
    #while i < len(dog_data['sequence'])-k+1:
        #print(dog_data['sequence'][j][i:i+k]) #prints out the k-mers 
        #i = i + 1
    #j = j + 1

### Begin K-mer counting.

In [135]:
# more efficient k-mer counting function
def getKmers(sequence, size=6): #defines the function
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)] #returns our k-mers
dog_data['words'] = dog_data.apply(lambda x: getKmers(x['sequence']), axis=1) #creating a new column in human_data df 
dog_data.head() #see what our new df looks like                                 #called "words" for k-mers

Unnamed: 0,sequence,class,words
0,ATGCCACAGCTAGATACATCCACCTGATTTATTATAATCTTTTCAA...,4,"[atgcca, tgccac, gccaca, ccacag, cacagc, acagc..."
1,ATGAACGAAAATCTATTCGCTTCTTTCGCTGCCCCCTCAATAATAG...,4,"[atgaac, tgaacg, gaacga, aacgaa, acgaaa, cgaaa..."
2,ATGGAAACACCCTTCTACGGCGATGAGGCGCTGAGCGGCCTGGGCG...,6,"[atggaa, tggaaa, ggaaac, gaaaca, aaacac, aacac..."
3,ATGTGCACTAAAATGGAACAGCCCTTCTACCACGACGACTCATACG...,6,"[atgtgc, tgtgca, gtgcac, tgcact, gcacta, cacta..."
4,ATGAGCCGGCAGCTAAACAGAAGCCAGAACTGCTCCTTCAGTGACG...,0,"[atgagc, tgagcc, gagccg, agccgg, gccggc, ccggc..."


In [136]:
dog_data = dog_data.drop('sequence', axis=1) #drops 'sequence' column

In [137]:
dog_data.head()

Unnamed: 0,class,words
0,4,"[atgcca, tgccac, gccaca, ccacag, cacagc, acagc..."
1,4,"[atgaac, tgaacg, gaacga, aacgaa, acgaaa, cgaaa..."
2,6,"[atggaa, tggaaa, ggaaac, gaaaca, aaacac, aacac..."
3,6,"[atgtgc, tgtgca, gtgcac, tgcact, gcacta, cacta..."
4,0,"[atgagc, tgagcc, gagccg, agccgg, gccggc, ccggc..."


In [118]:
dog_texts = list(dog_data['words']) # converts all elements with the 'words' column into individual strings wherever 
                                        # a comma is present
for item in range(len(dog_texts)):    
    dog_texts[item] = ' '.join(dog_texts[item])
    #print(dog_texts)
y_dog_data = dog_data.iloc[:, 0].values #.values shows only the values not the column or row labels. Shows 'class' values   

In [120]:
y_dog_data

array([4, 4, 6, 6, 0, 6, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 6, 0, 0, 0, 0, 0,
       6, 0, 0, 0, 6, 0, 0, 6, 0, 0, 0, 0, 0, 0, 3, 0, 6, 0, 6, 0, 6, 0,
       6, 0, 0, 3, 3, 6, 6, 6, 6, 4, 0, 0, 4, 2, 0, 0, 0, 0, 6, 0, 6, 0,
       0, 5, 5, 6, 4, 6, 0, 0, 3, 6, 3, 6, 6, 6, 6, 6, 4, 6, 4, 1, 4, 0,
       0, 0, 6, 4, 4, 6, 6, 6, 6, 6, 4, 6, 6, 3, 3, 3, 3, 0, 6, 3, 3, 6,
       4, 4, 5, 6, 6, 1, 5, 2, 2, 1, 4, 2, 6, 3, 3, 6, 6, 6, 3, 3, 6, 6,
       6, 3, 6, 6, 2, 0, 0, 6, 6, 3, 3, 3, 0, 0, 4, 6, 6, 5, 1, 1, 0, 0,
       0, 4, 0, 4, 0, 0, 3, 4, 6, 0, 6, 6, 3, 4, 0, 6, 6, 1, 2, 2, 2, 3,
       6, 2, 2, 2, 6, 0, 0, 6, 6, 0, 0, 1, 6, 5, 5, 6, 0, 6, 0, 0, 4, 0,
       0, 3, 4, 0, 0, 6, 6, 5, 5, 3, 4, 1, 0, 6, 5, 6, 6, 4, 4, 2, 2, 2,
       4, 6, 4, 4, 2, 1, 3, 6, 0, 6, 2, 3, 0, 0, 6, 0, 6, 4, 3, 1, 1, 4,
       6, 4, 4, 6, 3, 3, 2, 2, 1, 1, 3, 3, 4, 3, 0, 3, 5, 5, 5, 5, 5, 5,
       5, 2, 2, 4, 4, 6, 4, 4, 6, 6, 6, 0, 3, 3, 4, 3, 0, 0, 6, 0, 0, 6,
       6, 1, 5, 5, 0, 0, 6, 6, 1, 1, 1, 4, 2, 2, 3,

## An aside here. Below is an example review of how Pandas Dataframe indexing work.

In [121]:
#example = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          #{'a': 100, 'b': 200, 'c': 300, 'd': 400},
          #{'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
#df = pd.DataFrame(example)
#df

In [122]:
#df.iloc[0:2, 0:3] # colon(:) means from 0th index to 2nd index, 1st set is rows, 2nd set is columns

In [123]:
#df.iloc[[0,2] [0:3]] # comma(,) means 0th index and 2nd index

## Back to the DNA Sequence Classifier
   ### Since our k-mer resemble text data, we will employ bag of words from natural language processing.

In [124]:
print(dog_texts[0]) #shows the first row

atgcca tgccac gccaca ccacag cacagc acagct cagcta agctag gctaga ctagat tagata agatac gataca atacat tacatc acatcc catcca atccac tccacc ccacct cacctg acctga cctgat ctgatt tgattt gattta atttat tttatt ttatta tattat attata ttataa tataat ataatc taatct aatctt atcttt tctttt cttttc ttttca tttcaa ttcaat tcaata caatat aatatt atattt tatttc atttct tttctc ttctca tctcac ctcacc tcaccc caccct accctc ccctct cctctt ctcttc tcttca cttcat ttcatc tcatcc catcct atccta tcctat cctatt ctattt tatttc atttca tttcaa ttcaac tcaact caacta aactaa actaaa ctaaaa taaaaa aaaaat aaaatt aaattt aatttc atttca tttcaa ttcaaa tcaaat caaatc aaatca aatcac atcact tcacta cactac actact ctacta tactac actacc ctaccc taccca acccag cccaga ccagaa cagaaa agaaaa gaaaac aaaacc aaaccc aacccg acccga cccgat ccgata cgataa gataac ataacc taacca aaccaa accaaa ccaaat caaatc aaatct aatctg atctgc tctgct ctgcta tgctaa gctaaa ctaaaa taaaat aaaatt aaattg aattgc attgct ttgctg tgctgg gctggt ctggtc tggtca ggtcaa gtcaac tcaaca caacat aacata acataa cataat ataatc

In [139]:
# Creating the Bag of Words model using CountVectorizer()
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_dog = cv.fit_transform(dog_texts)

In [140]:
#print(X_human.shape)
#print(X_chimp.shape)
print(X_dog.shape)

(820, 4186)


In [141]:
#human_data['class'].value_counts().sort_index().plot.bar()

In [147]:
# Splitting the human dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_dog, y_data, test_size = 0.20, random_state=22) #creating our traning 
                                                                                                        #split, 80/20 ratio

ValueError: Found input variables with inconsistent numbers of samples: [820, 4380]

In [148]:
print(X_train.shape)
print(X_test.shape)

(3504, 4469)
(876, 4469)


In [144]:
### Multinomial Naive Bayes Classifier ###
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.01)
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [145]:
y_pred = classifier.predict(X_test)

In [146]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted')))
def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

Confusion matrix

Predicted   0   1   2   3   4   5    6
Actual                                
0          63  13  11   1  12   1    1
1           5  77  16   2   2   0    4
2           1  11  47   1  11   0    7
3           4   6  14  81  14   0    6
4           8   8  25   5  95   1    7
5           2   3  17   0   2  25    2
6           6  25  59   5  12   0  158
accuracy = 0.623 
precision = 0.713 
recall = 0.623 
f1 = 0.647


## Very special thanks and recognition to Kaggle for providing the data and Krish Naik's YouTube tutorials.