# Data Processing
This file is going to be very important for this problem. While it is wise to preprocess, especially via data scaling, it is especially important in this case because the data is nonsense without it. Two of our three features aren't even numbers and are therefore meaningless without data processing.

# **Import Some Packages**

In [65]:
import pandas as pd
import numpy as np

# for fixing train.csv
import csv

# Defining some variables

In [66]:
tr_path = "../data/updated_train.csv"
old_tr_path = "../data/updated_train_nullpH.csv"
tt_path = "../data/test.csv"

# A simple approach 
- for protein_sequence, use its length
- for data_source, put them all into a dictionary and use its index. 

In [67]:
# when given a path: String, return a modified df: pd.DataFrame which
from numpy import dtype


def simple_convert(path):
    df = pd.read_csv(path, index_col=False)
    df.drop(columns=["pH", "data_source"])
    df['protein_sequence'] = df['protein_sequence'].apply(convToList)
    return df
    
def convToList(x):
    str = ""
    for i in range(len(x)):
        if i == len(x) - 1:
            str += f"{ord(x[i])}"
        else:
            str += f"{ord(x[i])}-"
    return str


In [68]:
df_train = simple_convert(tr_path)
print(df_train)

       seq_id                                   protein_sequence   pH  \
0           0  65-65-65-65-75-65-65-65-76-65-76-76-71-69-65-8...  7.0   
1           1  65-65-65-68-71-69-80-76-72-78-69-69-69-82-65-7...  7.0   
2           2  65-65-65-70-83-84-80-82-65-84-83-89-82-73-76-8...  7.0   
3           3  65-65-65-83-71-76-82-84-65-73-80-65-81-80-76-8...  7.0   
4           4  65-65-65-84-75-83-71-80-82-82-81-83-81-71-65-8...  7.0   
...       ...                                                ...  ...   
28690   28690  89-89-77-89-83-71-71-71-83-65-76-65-65-71-71-7...  7.0   
28691   28691  89-89-78-68-81-72-82-76-83-83-89-83-86-69-84-6...  7.0   
28692   28692  89-89-81-82-84-76-71-65-69-76-76-89-75-73-83-7...  7.0   
28693   28693  89-89-83-70-83-68-78-73-84-84-86-70-76-83-82-8...  7.0   
28694   28694  89-89-86-80-68-69-89-87-81-83-76-69-86-65-72-7...  7.0   

                             data_source    tm  
0      doi.org/10.1038/s41592-020-0801-4  75.7  
1      doi.org/10.1038/s4

# A better approach 
- for protein_sequence, use the information in wildtype_structure_prediction_af2.pdb
- for data_source, put them all into a dictionary and use its index. 

In [69]:
# def better_convert(path):
#     df = pd.read_csv(path)
#     df["protein_sequence"] = df["protein_sequence"].apply(lambda x: len(x), convert_dtype=int)
#     df["data_source"] = df["data_source"].apply(conv_datasrc, args=(data_source_dict,), convert_dtype=int)
#     return df

# Fixing train.csv
There are some null values for pH in about 200 of the 30,000 data samples, which are causing problems. So, we decided to make a new train.csv file with modifications

In [70]:
# Read in Data, modifiying it as we go
def remove_nullpH(file, newFile):
  rows = []
  with open(file, newline='') as csvfile:
    reader = csv.reader(csvfile)
    i = -1
    for row in reader:
      if i == -1:
        rows.append(row)
        i  = 0
        continue
      if i == 0 :
        i = int(row[0])
      elif row[2] == "":
        continue
      row[0] = str(i)
      rows.append(row)
      i += 1


  #Write the Data to File
  with open(newFile, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)
  print("saved file to ", tr_path)

In [71]:
remove_nullpH(old_tr_path, tr_path)

saved file to  ../data/updated_train.csv
