# **METK Barley SNP-Chip:** Exploring the correlation between barley’s genetic makeup and its protein content
## Importing and editing the necessary datasets

In [29]:
import pandas as pd
# Load both input tables (SNP data first, then barley data) from CSV files in the working directory
snip_data, barley_data = (
    pd.read_csv(csv_name) for csv_name in ("SNIP_DATA.csv", "BARLEY_DATA.csv")
)

In [30]:
# Giving the first column in "SNIP_DATA.csv" the header 'SNP' since it was originally unnamed.
# rename() returns a new frame; rebinding the name avoids mutating the loaded frame in place.
snip_data = snip_data.rename(columns={snip_data.columns[0]: 'SNP'})

# Dropping the 'Id' column of "BARLEY_DATA.csv".
# errors='ignore' keeps this cell re-runnable: on a second run 'Id' is already gone
# and a plain drop would raise KeyError.
barley_data = barley_data.drop(columns='Id', errors='ignore')

## Cleaning dataset 'SNIP_DATA.csv' based on the following criteria: 
### 1. Handling 'failed' values by replacing them with NaN

In [31]:
# Cells holding the literal string 'failed' become missing values (pd.NA)
snip_data = snip_data.replace(to_replace='failed', value=pd.NA)

### 2. Removing SNPs with the same allele across all varieties

In [32]:
# Every column except the first one holds the allele calls of one variety
snp_columns = snip_data.columns[1:]

# Number of distinct alleles in each row (one row per SNP).
# Missing values are not counted, so a row such as ["A", "A", "A", "A", NaN, NaN] has exactly one allele.
distinct_alleles_per_snp = snip_data[snp_columns].nunique(axis=1)

# Keeping only the rows with more than one distinct allele; rows with a single allele (or none) are removed
snip_data = snip_data[distinct_alleles_per_snp > 1]

### 3. Removing barley varieties that are not present in both datasets

In [33]:
def normalize_variety_names(variety):
    """Return the variety name as a string with every '.' removed.

    Some varieties are written in dotted form in barley_data (e.g. '5777.7.1.2')
    but without dots in snip_data ('5777712'); removing the dots makes the two
    spellings identical. Non-string input is converted with str() first.
    """
    return ''.join(str(variety).split('.'))

In [34]:
# Normalizing variety names in barley_data so they can be compared with the column names of snip_data
barley_data['Nimi'] = barley_data['Nimi'].apply(normalize_variety_names)

# Extracting variety names from snip_data (columns starting from the second column)
snip_varieties = set(snip_data.columns[1:])

# Extracting variety names from barley_data (row values in the 'Nimi' column)
barley_varieties = set(barley_data['Nimi'])

# Finding common varieties
common_varieties = snip_varieties.intersection(barley_varieties)

# Listing the common varieties in the column order of snip_data.
# A set of strings iterates in a different order on each kernel start, so selecting
# columns with list(common_varieties) would make the column order non-reproducible.
common_variety_columns = [variety for variety in snip_data.columns[1:] if variety in common_varieties]

# Filtering snip_data to keep only common varieties
snip_data = snip_data[['SNP'] + common_variety_columns]

# Filtering barley_data to keep only rows with common varieties.
# .copy() makes the result an independent frame, so assigning to its 'Nimi' column
# when this cell is re-run does not trigger a SettingWithCopyWarning.
barley_data = barley_data[barley_data['Nimi'].isin(common_varieties)].copy()

## Processing the datasets 
### **In preparation for finding correlations between protein content and genetic makeup**

In [35]:
# Independent copy of barley_data restricted to the variety name ('Nimi') and protein ('Proteiin') columns
protein_columns = ['Nimi', 'Proteiin']
protein_data = barley_data.loc[:, protein_columns].copy()

### Merging datasets on variety name

In [None]:
# Transposing snip_data so that barley varieties are rows and SNPs are columns.
# The variety labels end up in the index, which is named 'Nimi' and then moved back into a regular column.
snip_data_transposed = (
    snip_data
    .set_index('SNP')
    .T
    .rename_axis(index='Nimi')
    .reset_index()
)

# Inner join on 'Nimi': only varieties present in both tables are kept
merged_data = protein_data.merge(snip_data_transposed, how='inner', on='Nimi')


Unnamed: 0,Nimi,Proteiin,BK_01,BK_03,BK_05,BK_08,BK_10,BK_12,BK_14,BK_17,...,TGBA15K-TG0384,TGBA15K-TG0385,TGBA15K-TG0386,TGBA15K-TG0388,TGBA15K-TG0395,TGBA15K-TG0400,TGBA15K-TG0402_NC_MA,TGBA15K-TG0402_NG_MA,TGBA15K-TG0403,TGBA15K-TG0409
0,Amidala,12.2,,T,C,G,,A,A,C,...,C,T,G,G,C,A,T,T,C,G
1,Amy,12.7,T,T,C,G,A,A,A,C,...,C,T,G,G,C,A,T,T,C,G
2,Anneli,13.0,T,T,T,G,A,A,A,C,...,C,G,G,G,C,A,T,T,C,A
3,Anni,12.4,T,T,C,C,A,A,A,G,...,C,T,T,G,C,A,T,T,C,G
4,Annika,11.2,T,T,C,G,A,A,A,C,...,C,T,G,G,C,A,T,T,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,6006142,12.1,T,T,C,C,A,A,A,C,...,C,T,G,G,C,A,T,T,C,G
167,6006153,11.3,T,T,C,G,A,A,A,C,...,C,T,G,G,C,A,T,T,C,G
168,6011421,11.4,T,T,C,C,A,A,A,C,...,C,T,T,G,C,A,T,T,C,G
169,6012243,11.6,T,T,C,G,A,A,A,C,...,C,T,T,G,C,A,T,T,C,G


### Checking which allele values occur in the data, in preparation for one-hot encoding

In [None]:
# Everything from the third column onwards is treated as an allele column
allele_columns = merged_data.columns[2:]

# Collapsing the allele columns into one long Series and discarding missing values
all_alleles = merged_data.loc[:, allele_columns].stack().dropna()

# Number of occurrences of each allele value
allele_counts = all_alleles.value_counts()

# Showing the counts
print(allele_counts)

G    562756
A    540192
C    492895
T    429012
R      2030
Y      1405
K       466
M       416
S        82
W        45
Name: count, dtype: int64


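**Brief biological explanation:** besides the four bases A, C, G and T, the data contains the IUPAC ambiguity codes R, Y, K, M, S and W, each of which stands for either of the two bases listed in the table.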

| Nucleotide Symbol | Full Name                       |
|-------------------|---------------------------------|
| A                 | Adenine                         |
| C                 | Cytosine                        |
| G                 | Guanine                         |
| T                 | Thymine                         |
| R                 | Guanine / Adenine (purine)      |
| Y                 | Cytosine / Thymine (pyrimidine) |
| K                 | Guanine / Thymine               |
| M                 | Adenine / Cytosine              |
| S                 | Guanine / Cytosine              |
| W                 | Adenine / Thymine               |
