# Analysis of barley harvested in 2024
## Importing the necessary datasets

In [62]:
import pandas as pd
# Both input files are read from paths relative to the working directory
snip_csv_path = "SNIP_DATA.csv"
barley_csv_path = "BARLEY_DATA.csv"

snip_data = pd.read_csv(snip_csv_path)
barley_data = pd.read_csv(barley_csv_path)

In [63]:
# The first column of "SNIP_DATA.csv" was originally unnamed, so it is given the header 'SNP'
first_column = snip_data.columns[0]
snip_data = snip_data.rename(columns={first_column: 'SNP'})

## Cleaning dataset 'SNIP_DATA.csv' based on the following criteria: 
### 1. Handling 'failed' values by replacing them with missing values (`pd.NA`)

In [64]:
# Every cell equal to 'failed' becomes a missing value
snip_data = snip_data.replace(to_replace='failed', value=pd.NA)

### 2. Removing SNPs with the same allele across all varieties

In [65]:
# Every column after the first one (the first column is skipped here)
snp_columns = snip_data.columns[1:]

# Counting the distinct values in each row. Missing values are not counted, so a row
# such as ["A", "A", "A", "A", NaN, NaN] has one distinct value and is removed.
distinct_values_per_row = snip_data[snp_columns].nunique(axis=1)

# Keeping only the rows that contain more than one distinct value
snip_data = snip_data[distinct_values_per_row > 1]

### 3. Removing barley varieties that are not present in both datasets

In [66]:
# The two datasets spell some variety names differently: a variety written as
# '5777.7.1.2' in barley_data appears as 5777712 in snip_data.
def normalize_variety_names(variety):
    """Return *variety* converted to a string with every '.' removed."""
    pieces = str(variety).split('.')
    return ''.join(pieces)

In [None]:
# Normalizing variety names in barley_data so they can be compared with the
# snip_data column headers
barley_data['Nimi'] = barley_data['Nimi'].apply(normalize_variety_names)

# Extracting variety names from snip_data (columns starting from the second column)
snip_varieties = set(snip_data.columns[1:])

# Extracting variety names from barley_data (row values in the 'Nimi' column)
barley_varieties = set(barley_data['Nimi'])

# Finding common varieties
common_varieties = snip_varieties.intersection(barley_varieties)

# Listing the common varieties in their original snip_data column order.
# Iterating over the set itself would give an arbitrary column order that can
# differ from one run to the next, making the result non-reproducible.
common_columns = [variety for variety in snip_data.columns[1:] if variety in common_varieties]

# Filtering snip_data to keep only the 'SNP' column and the common varieties
snip_data = snip_data[['SNP'] + common_columns]

# Filtering barley_data to keep only rows with common varieties; the explicit
# copy makes the filtered frame independent of the unfiltered one
barley_data = barley_data[barley_data['Nimi'].isin(common_varieties)].copy()

Unnamed: 0,Id,Nimi,Terasaak,Seisukindlus,Mass,Pikkus,Külv-küpsus,Külv-loomine,Loomine-küpsus,Äärislaiksus,Võrk,Pruun,Jahukaste,Proteiin
1,2,Amidala,4958,9,49.5,59,85,52,33,1,4.0,4.0,2.0,12.2
2,3,Amy,4540,9,44.5,57,86,55,32,1,5.0,4.0,1.0,12.7
3,4,Anneli,4589,9,43.9,56,87,54,33,1,4.0,3.0,2.0,13.0
4,5,Anni,4556,9,43.9,55,89,57,32,1,3.5,3.5,4.5,12.4
5,6,Annika,4551,9,45.4,52,87,52,34,1,3.5,3.5,2.0,11.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,187,6006142,4658,9,43.3,52,92,57,35,1,5.0,4.5,1.0,12.1
187,188,6006153,5193,9,40.8,55,92,58,34,1,5.0,4.0,1.0,11.3
188,189,6011421,4629,9,46.0,53,92,58,34,1,4.5,4.0,1.0,11.4
189,190,6012243,5256,9,43.3,55,92,58,34,1,4.0,3.5,3.0,11.6
