# Data exploration

* This is a data exploration notebook. We aim to analyze the data to gain insight on the classification task that consists in predicting whether a particular DNA sequence exhibits favorable properties for bounding a protein.

## Imports

In [6]:
import csv
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data loading

* We investigate directly the *raw* data (i.e. the sequences of *A*, *T*, *C* and *G*) rather than the intermediate representation obtained by using K-Means on sub-sequences.

In [7]:
CWD = os.getcwd()
DATA_DIR = os.path.join(CWD, "data")
RESULT_DIR = os.path.join(CWD, "results")

FILES = {0: {"train_mat": "Xtr0_mat100.csv",
             "train": "Xtr0.csv",
             "test_mat": "Xte0_mat100.csv",
             "test": "Xte0.csv",
             "label": "Ytr0.csv"},
         1: {"train_mat": "Xtr1_mat100.csv",
             "train": "Xtr1.csv",
             "test_mat": "Xte1_mat100.csv",
             "test": "Xte1.csv",
             "label": "Ytr1.csv"},
         2: {"train_mat": "Xtr2_mat100.csv",
             "train": "Xtr2.csv",
             "test_mat": "Xte2_mat100.csv",
             "test": "Xte2.csv",
             "label": "Ytr2.csv"}}

In [8]:
def load_data(file_id, mat=True):
    
    X_train = list()
    Y_train = list()
    X_test = list()
    
    dic = FILES[file_id]
    
    if mat:
        files = [dic["train_mat"], dic["label"], dic["test_mat"]]
    else:
        files = [dic["train"], dic["label"], dic["test"]]

    for file, l in zip(files, [X_train, Y_train, X_test]):
        with open(os.path.join(DATA_DIR, file), "r", newline="") as csvfile:
            if "mat" in file:
                reader = csv.reader(csvfile, delimiter=" ")
                for row in reader:
                    l.append(row)
            else:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader, None) # Skip the header
                for row in reader:
                    l.append(row[1])
                
    if mat:
        X_train = np.array(X_train).astype("float")
        Y_train = np.array(Y_train).astype("int")
        X_test = np.array(X_test).astype("float")
        np.random.seed(0)
        index = np.random.permutation(len(X_train))
        X_train = X_train[index]
        Y_train = Y_train[index]
    
    else:
        np.random.seed(0)
        index = np.random.permutation(len(X_train))
        X_train = [X_train[i] for i in index]
        Y_train = [Y_train[i] for i in index]
        Y_train = np.array(Y_train).astype("int")
        
    
    return X_train, Y_train, X_test

In [9]:
# Instanciate dictionaries
TRAIN_DIC = {i: {"sequence": None, "label": None} for i in range(len(FILES))}
TEST_DIC = {i: {"sequence": None} for i in range(len(FILES))}

# Populate dictionaries
for i in range(len(FILES)):
    X_train, Y_train, X_test = load_data(i, mat=False)
    TRAIN_DIC[i]["sequence"] = X_train
    TRAIN_DIC[i]["label"] = Y_train
    TEST_DIC[i]["sequence"] = X_test

# Populate list of dictionaries to concatenate them below
train_dfs = list()
test_dfs = list()

for i in range(len(FILES)):
    tmp_train = pd.DataFrame(data=TRAIN_DIC[i])
    tmp_train["dataset"] = i # dataset index
    train_dfs.append(tmp_train)
    tmp_test = pd.DataFrame(data=TEST_DIC[i])
    tmp_test["dataset"] = i # dataset index
    test_dfs.append(tmp_test)

# Build pandas dataframe (training set)
train_df = pd.concat(train_dfs, ignore_index=True)
# Build pandas dataframe (testing set)
test_df = pd.concat(test_dfs, ignore_index=True)
train_df.head()

Unnamed: 0,sequence,label,dataset
0,TTTTTGGAGATGGAATTTCAATCTTGTTGCCCAGGGTGGAATGCAA...,0,0
1,TTTTTTTAGATGGAGTCTCACTCTTGTCGCCCAGGCTGGAGTGCAA...,0,0
2,GTACAACGGAGATAATCATCTGAGCTCTGTCTGCTTCCTCTGGCTA...,1,0
3,TTTCTAGACAGGAAAATTGAAACATATTAGCTTTATTCATGTATAG...,0,0
4,GAAGAATGGCGTGAACCTGGGAGGCAGAGGTTGCAGTGAGCCGAGA...,0,0


In [15]:
def compute_stats(df, test=False):
    """
    Process a dataframe to compute statistics 
    on the sequences present in the three datasets
    
    Parameters
    -----------
    - df : pandas.DataFrame
        Dataframe on which to compute statistics (either train or test)
    
    - test : boolean (optional)    
        True if dealing with test data (i.e. skip duplicates label sanity check)
    
    Returns
    -----------
    - df : pandas.DataFrame
        Original dataframe populated with new statistics
    """

    # Length of the strings
    df["seq_length"] = df.sequence.str.len()

    # Number of A, T, C, G
    df["As"] = df.sequence.str.count("A")
    df["Ts"] = df.sequence.str.count("T")
    df["Cs"] = df.sequence.str.count("C")
    df["Gs"] = df.sequence.str.count("G")
    assert all(df.As + df.Ts + df.Cs + df.Gs == df.seq_length) # There are only A, T, C and G
    df.head()

    # TODO: intra and extra occurences (i.e. how many occurences we have for each sequence in its own dataset VS in all datasets)

    # Extra occurences
    _, inv, c = np.unique(df.sequence.values, return_inverse=True, return_counts=True)
    df["unique_all"] = [c[inv[i]] for i in range(len(inv))]

    if not test:
        # Assert that the sequence multiple occurences have the same label
        doubles_df = df[df.unique_all > 1].sort_values(by="sequence")[["sequence", "label"]]

        values = doubles_df.iloc[0].values
        curr_seq = values[0]
        curr_label = values[1]

        for i in range(1, len(doubles_df)):
            values = doubles_df.iloc[i].values
            next_seq = values[0]
            next_label = values[1]
            if next_seq != curr_seq:
                continue
            else:
                if next_label != curr_label:
                    raise ValueError(f"Duplicate ({next_seq}) has different labels! ")

    # TODO: Intra occurences


    # Longest straight sequence (letter and number)
    for k, seq in enumerate(df.sequence.values):
        counts = {a: 0 for a in ["A", "T", "C", "G"]}
        curr_count = 0
        curr_char = seq[0]
        curr_count += 1
        for i in range(1, len(seq)):
            next_char = seq[i]
            if next_char != curr_char:
                if curr_count > counts[curr_char]:
                    counts[curr_char] = curr_count
                # reset current count
                curr_count = 0
                curr_char = next_char
                curr_count += 1
            else:
                curr_count += 1
        df.loc[k, "A_longest"] = counts["A"]
        df.loc[k, "T_longest"] = counts["T"]
        df.loc[k, "C_longest"] = counts["C"]
        df.loc[k, "G_longest"] = counts["G"]
        df.loc[k, "Longest"] = max(counts.values())
        df.loc[k, "Longest_char"] = max(counts, key=counts.get)

    return df    

In [13]:
train_stats_df = compute_stats(train_df)
train_stats_df.describe()

Unnamed: 0,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.495667,1.0,101.0,24.626167,24.407167,26.162333,25.804333,1.070667,3.0985,3.114833,3.137833,3.143,4.445167
std,0.500023,0.816565,0.0,7.672998,7.793149,8.079637,7.847801,0.256289,1.486438,1.497115,1.033447,1.053756,1.51626
min,0.0,0.0,101.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,101.0,19.0,19.0,21.0,20.0,1.0,2.0,2.0,2.0,2.0,4.0
50%,0.0,1.0,101.0,24.0,24.0,26.0,25.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,2.0,101.0,30.0,30.0,31.0,31.0,1.0,4.0,4.0,4.0,4.0,5.0
max,1.0,2.0,101.0,56.0,58.0,57.0,56.0,2.0,22.0,24.0,12.0,15.0,24.0


In [21]:
train_stats_df[train_stats_df.dataset == 0].describe()

Unnamed: 0,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4885,0.0,101.0,28.068,28.1635,22.2945,22.474,1.011,3.545,3.6855,2.838,2.845,4.715
std,0.499993,0.0,0.0,7.291087,7.455707,6.666858,6.490716,0.104329,1.612227,1.75446,0.974798,0.928119,1.804273
min,0.0,0.0,101.0,6.0,6.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,0.0,0.0,101.0,23.0,23.0,18.0,18.0,1.0,3.0,3.0,2.0,2.0,4.0
50%,0.0,0.0,101.0,28.0,28.0,22.0,22.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,0.0,101.0,33.0,33.0,26.0,27.0,1.0,4.0,4.0,3.0,3.0,5.0
max,1.0,0.0,101.0,52.0,53.0,55.0,53.0,2.0,22.0,24.0,12.0,7.0,24.0


In [22]:
train_stats_df[train_stats_df.dataset == 1].describe()

Unnamed: 0,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.499,1.0,101.0,23.172,23.041,27.5915,27.1955,1.201,3.0495,2.989,3.2955,3.303,4.416
std,0.500124,0.0,0.0,7.397823,7.365354,7.811859,7.802289,0.400848,1.417058,1.263201,1.004831,1.093521,1.392449
min,0.0,1.0,101.0,4.0,5.0,7.0,5.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,0.0,1.0,101.0,18.0,18.0,22.0,21.75,1.0,2.0,2.0,3.0,3.0,4.0
50%,0.0,1.0,101.0,23.0,22.0,27.0,27.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,1.0,101.0,28.0,28.0,33.0,32.0,1.0,4.0,3.0,4.0,4.0,5.0
max,1.0,1.0,101.0,56.0,58.0,57.0,55.0,2.0,22.0,18.0,8.0,15.0,22.0


In [23]:
train_stats_df[train_stats_df.dataset == 2].describe()

Unnamed: 0,label,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4995,2.0,101.0,22.6385,22.017,28.601,27.7435,1.0,2.701,2.67,3.28,3.281,4.2045
std,0.500125,0.0,0.0,7.132689,7.122083,8.214228,8.063822,0.0,1.289741,1.229982,1.053639,1.068461,1.254379
min,0.0,2.0,101.0,2.0,2.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,2.0,101.0,18.0,17.0,23.0,23.0,1.0,2.0,2.0,3.0,3.0,4.0
50%,0.0,2.0,101.0,22.0,21.0,28.0,27.0,1.0,2.0,2.0,3.0,3.0,4.0
75%,1.0,2.0,101.0,27.0,26.0,34.0,33.0,1.0,3.0,3.0,4.0,4.0,5.0
max,1.0,2.0,101.0,49.0,48.0,55.0,56.0,1.0,17.0,17.0,8.0,9.0,17.0


In [17]:
test_stats_df = compute_stats(test_df, test=True)
test_stats_df.describe()

Unnamed: 0,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1.0,101.0,24.53,24.374,26.047667,26.048333,1.038667,3.060667,3.158667,3.131333,3.138,4.468333
std,0.816633,0.0,7.655846,7.73098,8.173596,7.994037,0.192831,1.446035,1.621018,1.019034,1.071761,1.583193
min,0.0,101.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,101.0,19.0,19.0,20.0,21.0,1.0,2.0,2.0,2.0,2.0,4.0
50%,1.0,101.0,24.0,24.0,26.0,26.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,2.0,101.0,30.0,29.0,32.0,31.0,1.0,4.0,4.0,4.0,4.0,5.0
max,2.0,101.0,53.0,56.0,60.0,64.0,2.0,26.0,28.0,8.0,12.0,28.0


In [18]:
test_stats_df[test_stats_df.dataset == 0].describe()

Unnamed: 0,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.0,101.0,28.088,28.247,22.089,22.576,1.006,3.548,3.825,2.809,2.832,4.785
std,0.0,0.0,7.549918,7.211657,6.816858,6.406658,0.077266,1.66387,2.0299,0.922697,0.935228,2.095511
min,0.0,101.0,6.0,7.0,5.0,8.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,0.0,101.0,23.0,23.75,17.0,18.0,1.0,2.0,3.0,2.0,2.0,4.0
50%,0.0,101.0,29.0,28.0,22.0,22.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,0.0,101.0,33.0,33.0,26.0,27.0,1.0,4.0,4.0,3.0,3.0,5.0
max,0.0,101.0,53.0,55.0,55.0,52.0,2.0,26.0,28.0,7.0,7.0,28.0


In [19]:
test_stats_df[test_stats_df.dataset == 1].describe()

Unnamed: 0,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.0,101.0,23.154,22.928,27.432,27.486,1.11,2.982,3.011,3.293,3.309,4.371
std,0.0,0.0,7.068681,7.20352,7.492898,7.844319,0.313046,1.213734,1.307895,0.976783,1.051963,1.24133
min,1.0,101.0,5.0,6.0,9.0,7.0,1.0,1.0,1.0,1.0,1.0,3.0
25%,1.0,101.0,18.0,18.0,22.0,22.0,1.0,2.0,2.0,3.0,3.0,4.0
50%,1.0,101.0,23.0,22.0,27.0,27.0,1.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,101.0,28.0,28.0,32.0,32.0,1.0,3.0,4.0,4.0,4.0,5.0
max,1.0,101.0,53.0,56.0,57.0,64.0,2.0,14.0,14.0,8.0,8.0,14.0


In [20]:
test_stats_df[test_stats_df.dataset == 2].describe()

Unnamed: 0,dataset,seq_length,As,Ts,Cs,Gs,unique_all,A_longest,T_longest,C_longest,G_longest,Longest
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.0,101.0,22.348,21.947,28.622,28.083,1.0,2.652,2.64,3.292,3.273,4.249
std,0.0,0.0,7.046456,7.247117,8.585151,8.425028,0.0,1.274567,1.149662,1.075124,1.151432,1.197678
min,2.0,101.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,101.0,18.0,17.0,23.0,23.0,1.0,2.0,2.0,3.0,3.0,4.0
50%,2.0,101.0,22.0,21.0,28.0,27.0,1.0,2.0,2.0,3.0,3.0,4.0
75%,2.0,101.0,27.0,26.0,34.0,33.0,1.0,3.0,3.0,4.0,4.0,5.0
max,2.0,101.0,51.0,48.0,60.0,57.0,1.0,14.0,9.0,8.0,12.0,14.0
