# Data exploration

* This is a data exploration notebook. We aim to analyze the data to gain insight on the classification task that consists in predicting whether a particular DNA sequence exhibits favorable properties for bounding a protein.

## Imports

In [1]:
import csv
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data loading

* We investigate directly the *raw* data (i.e. the sequences of *A*, *T*, *C* and *G*) rather than the intermediate representation obtained by using K-Means on sub-sequences.

In [2]:
CWD = os.getcwd()
DATA_DIR = os.path.join(CWD, "data")
RESULT_DIR = os.path.join(CWD, "results")

FILES = {0: {"train_mat": "Xtr0_mat100.csv",
             "train": "Xtr0.csv",
             "test_mat": "Xte0_mat100.csv",
             "test": "Xte0.csv",
             "label": "Ytr0.csv"},
         1: {"train_mat": "Xtr1_mat100.csv",
             "train": "Xtr1.csv",
             "test_mat": "Xte1_mat100.csv",
             "test": "Xte1.csv",
             "label": "Ytr1.csv"},
         2: {"train_mat": "Xtr2_mat100.csv",
             "train": "Xtr2.csv",
             "test_mat": "Xte2_mat100.csv",
             "test": "Xte2.csv",
             "label": "Ytr2.csv"}}

In [3]:
def load_data(file_id, mat=True):
    
    X_train = list()
    Y_train = list()
    X_test = list()
    
    dic = FILES[file_id]
    
    if mat:
        files = [dic["train_mat"], dic["label"], dic["test_mat"]]
    else:
        files = [dic["train"], dic["label"], dic["test"]]

    for file, l in zip(files, [X_train, Y_train, X_test]):
        with open(os.path.join(DATA_DIR, file), "r", newline="") as csvfile:
            if "mat" in file:
                reader = csv.reader(csvfile, delimiter=" ")
                for row in reader:
                    l.append(row)
            else:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader, None) # Skip the header
                for row in reader:
                    l.append(row[1])
                
    if mat:
        X_train = np.array(X_train).astype("float")
        Y_train = np.array(Y_train).astype("int")
        X_test = np.array(X_test).astype("float")
        np.random.seed(0)
        index = np.random.permutation(len(X_train))
        X_train = X_train[index]
        Y_train = Y_train[index]
    
    else:
        np.random.seed(0)
        index = np.random.permutation(len(X_train))
        X_train = [X_train[i] for i in index]
        Y_train = [Y_train[i] for i in index]
        Y_train = np.array(Y_train).astype("int")
        
    
    return X_train, Y_train, X_test

In [99]:
# Instanciate dictionaries
TRAIN_DIC = {i: {"sequence": None, "label": None} for i in range(len(FILES))}
TEST_DIC = {i: {"sequence": None} for i in range(len(FILES))}

# Populate dictionaries
for i in range(len(FILES)):
    X_train, Y_train, X_test = load_data(i, mat=False)
    TRAIN_DIC[i]["sequence"] = X_train
    TRAIN_DIC[i]["label"] = Y_train
    TEST_DIC[i]["sequence"] = X_test

# Populate list of dictionaries to concatenate them below
train_dfs = list()
test_dfs = list()

for i in range(len(FILES)):
    tmp_train = pd.DataFrame(data=TRAIN_DIC[i])
    tmp_train["dataset"] = i # dataset index
    train_dfs.append(tmp_train)
    tmp_test = pd.DataFrame(data=TEST_DIC[i])
    tmp_test["dataset"] = i # dataset index
    test_dfs.append(tmp_test)

# Build pandas dataframe (training set)
train_df = pd.concat(train_dfs, ignore_index=True)
# Build pandas dataframe (testing set)
test_df = pd.concat(test_dfs, ignore_index=True)
train_df.head()

Unnamed: 0,sequence,label,dataset
0,TTTTTGGAGATGGAATTTCAATCTTGTTGCCCAGGGTGGAATGCAA...,0,0
1,TTTTTTTAGATGGAGTCTCACTCTTGTCGCCCAGGCTGGAGTGCAA...,0,0
2,GTACAACGGAGATAATCATCTGAGCTCTGTCTGCTTCCTCTGGCTA...,1,0
3,TTTCTAGACAGGAAAATTGAAACATATTAGCTTTATTCATGTATAG...,0,0
4,GAAGAATGGCGTGAACCTGGGAGGCAGAGGTTGCAGTGAGCCGAGA...,0,0


In [100]:
# TODO: Compute statistics on the strings

# Length of the strings
train_df["seq_length"] = train_df.sequence.str.len()

# Number of A, T, C, G
train_df["As"] = train_df.sequence.str.count("A")
train_df["Ts"] = train_df.sequence.str.count("T")
train_df["Cs"] = train_df.sequence.str.count("C")
train_df["Gs"] = train_df.sequence.str.count("G")
assert all(train_df.As + train_df.Ts + train_df.Cs + train_df.Gs == train_df.seq_length) # There are only A, T, C and G
train_df.head()

# TODO: intra and extra occurences (i.e. how many occurences we have for each sequence in its own dataset VS in all datasets)

# Extra occurences
u, inv, c = np.unique(train_df.sequence.values, return_inverse=True, return_counts=True)
train_df["unique_all"] = [c[inv[i]] for i in range(len(inv))]

# Assert that the sequence multiple occurences have the same label
doubles_df = train_df[train_df.unique_all > 1].sort_values(by="sequence")[["sequence", "label"]]

values = doubles_df.iloc[0].values
curr_seq = values[0]
curr_label = values[1]

for i in range(1, len(doubles_df)):
    values = doubles_df.iloc[i].values
    next_seq = values[0]
    next_label = values[1]
    if next_seq != curr_seq:
        continue
    else:
        if next_label != curr_label:
            raise ValueError(f"Duplicate ({next_seq}) has different labels! ")

# TODO: Intra occurences


# TODO: Longest straight sequence (letter and number)
for k, seq in enumerate(train_df.sequence.values):
    counts = {a: 0 for a in ["A", "T", "C", "G"]}
    curr_count = 0
    curr_char = seq[0]
    curr_count += 1
    for i in range(1, len(seq)):
        next_char = seq[i]
        if next_char != curr_char:
            if curr_count > counts[curr_char]:
                counts[curr_char] = curr_count
            # reset current count
            curr_count = 0
            curr_char = next_char
            curr_count += 1
        else:
            curr_count += 1
    train_df.loc[k, "A_longest"] = counts["A"]
    train_df.loc[k, "T_longest"] = counts["T"]
    train_df.loc[k, "C_longest"] = counts["C"]
    train_df.loc[k, "G_longest"] = counts["G"]
    train_df.loc[k, "Longest"] = max(counts.values())
    train_df.loc[k, "Longest_char"] = max(counts, key=counts.get)
    
train_df.head()

Unnamed: 0,sequence,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest,Longest_char
0,TTTTTGGAGATGGAATTTCAATCTTGTTGCCCAGGGTGGAATGCAA...,0,0,101,23,27,23,28,1,2.0,5.0,3.0,3.0,5.0,T
1,TTTTTTTAGATGGAGTCTCACTCTTGTCGCCCAGGCTGGAGTGCAA...,0,0,101,18,31,30,22,1,2.0,7.0,3.0,3.0,7.0,T
2,GTACAACGGAGATAATCATCTGAGCTCTGTCTGCTTCCTCTGGCTA...,1,0,101,25,25,28,23,1,3.0,3.0,3.0,2.0,3.0,A
3,TTTCTAGACAGGAAAATTGAAACATATTAGCTTTATTCATGTATAG...,0,0,101,34,36,18,13,1,4.0,5.0,3.0,2.0,5.0,T
4,GAAGAATGGCGTGAACCTGGGAGGCAGAGGTTGCAGTGAGCCGAGA...,0,0,101,29,16,25,31,1,5.0,2.0,2.0,3.0,5.0,A


In [101]:
train_df.describe()

Unnamed: 0,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.495667,1.0,101.0,24.626167,24.407167,26.162333,25.804333,1.070667,3.0985,3.114833,3.137833,3.143,4.445167
std,0.500023,0.816565,0.0,7.672998,7.793149,8.079637,7.847801,0.256289,1.486438,1.497115,1.033447,1.053756,1.51626
min,0.0,0.0,101.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,101.0,19.0,19.0,21.0,20.0,1.0,2.0,2.0,2.0,2.0,4.0
50%,0.0,1.0,101.0,24.0,24.0,26.0,25.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,2.0,101.0,30.0,30.0,31.0,31.0,1.0,4.0,4.0,4.0,4.0,5.0
max,1.0,2.0,101.0,56.0,58.0,57.0,56.0,2.0,22.0,24.0,12.0,15.0,24.0
