In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# **About the data **

It is from a kaggle dataset about genes
https://www.kaggle.com/datasets/mohamedabdullah/human-genes



In [2]:
#Reading data
genes = pd.read_csv("humangenes.csv")
genes.head()

Unnamed: 0,id,shortName,FullName
0,1,AAAS,aladin WD repeat nucleoporin
1,2,AASS,aminoadipate-semialdehyde synthase
2,3,ABAT,4-aminobutyrate aminotransferase
3,4,ABCA1,ATP binding cassette subfamily A member 1
4,5,ABCA3,ATP binding cassette subfamily A member 3


In [3]:
genes.tail()

Unnamed: 0,id,shortName,FullName
1467,1468,ZFP57,ZFP57 zinc finger protein
1468,1469,ZFYVE26,zinc finger FYVE-type containing 26
1469,1470,ZIC2,Zic family member 2
1470,1471,ZMPSTE24,zinc metallopeptidase STE24
1471,1472,ZMYM2,zinc finger MYM-type containing 2


The wording in full name is a little unorganized and viewing it by short name may not be as accurate because their maybe different variations.

**However, the short names seem to be well abbreviated within the first 4 rows and last 5, so it may not be inaccurate.** ðŸ˜Š

# checking data for duplicates and if it can be used for ML

# **Question:**
 how many different genes are there?

In [4]:
#trying ML to categorize them by amount...
#Con to this is that I won't know which gene is which, but will know of any clusters


label_encoder = LabelEncoder()
# Fit and transform the short name column to numerical labels
gene_numerical_labels = label_encoder.fit_transform(genes['shortName'])
print("Text Labels:", label_encoder)
print("Numerical Labels:", gene_numerical_labels)


Text Labels: LabelEncoder()
Numerical Labels: [   0    1    2 ... 1469 1470 1471]


In [5]:
genes.head()

Unnamed: 0,id,shortName,FullName
0,1,AAAS,aladin WD repeat nucleoporin
1,2,AASS,aminoadipate-semialdehyde synthase
2,3,ABAT,4-aminobutyrate aminotransferase
3,4,ABCA1,ATP binding cassette subfamily A member 1
4,5,ABCA3,ATP binding cassette subfamily A member 3


# **Answer:**
based off trying to label each short name into a categorical value by numerical labels, each would be labeled individually. Therefore there are 1471 different genes. None should be duplicated

In [6]:
#Dropping duplicates for accuracy
gene_df =pd.DataFrame(genes)
gene_df_no_duplicates = gene_df.drop_duplicates()

gene_df_no_duplicates.tail()

Unnamed: 0,id,shortName,FullName
1467,1468,ZFP57,ZFP57 zinc finger protein
1468,1469,ZFYVE26,zinc finger FYVE-type containing 26
1469,1470,ZIC2,Zic family member 2
1470,1471,ZMPSTE24,zinc metallopeptidase STE24
1471,1472,ZMYM2,zinc finger MYM-type containing 2


# Categorizing gene names that match the initials for B.R.I.A.U.N.A

# **Question:**
Can we classify them by being word of gene and see how many there is in the data?

Choose the genes that begin with the letters in my name

# **Answer:**
**USE THIS CODE BELOW QUERY TO FILTER BY GENE NAME**


b_in_beginning = 'B'  

filtered_data_withb = gene_df[gene_df['FullName'].str.startswit(b_in_beginning)]

**USE CODE BELOW TO PRINT VALUES**

print("Data with 'Full Names' starting with 'b':")

print(filtered_data_withb)

print("________________________________")

print("There are a total of",filtered_data_withb['FullName'].count(),"human genes within the b section of this gene data")

print("Also there are 12 groups of genes that start with b")

# **B**

In [7]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


b_in_beginning = 'B'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withb = gene_df[gene_df['FullName'].str.startswith(b_in_beginning)]

print("Data with 'Full Names' starting with 'b':")
print(filtered_data_withb)
print("________________________________")
print("There are a total of",filtered_data_withb['FullName'].count(),"human genes within the b section of this gene data")
print("Also there are 12 groups of genes that start with b")

Data with 'Full Names' starting with 'b':
      id shortName                                           FullName
157  158      BAP1                         BRCA1 associated protein 1
158  159      BBS1                            Bardet-Biedl syndrome 1
159  160     BBS10                           Bardet-Biedl syndrome 10
163  164      BCOR                                   BCL6 corepressor
164  165       BCR          BCR, RhoGEF and GTPase activating protein
165  166     BCS1L  BCS1 homolog, ubiquinol-cytochrome c reductase...
168  169     BICD2                               BICD cargo adaptor 2
170  171       BLM                             BLM RecQ like helicase
174  175      BRAF      B-Raf proto-oncogene, serine/threonine kinase
175  176     BRCA1                       BRCA1, DNA repair associated
176  177     BRCA2                       BRCA2, DNA repair associated
177  178     BSCL2  BSCL2, seipin lipid droplet biogenesis associated
180  181       BTK                             B

**id 158 = BRCA1 associated protein 1 [cancer]**

**id 159 & 160 = Bardet-Biedl syndrome 1 & Bardet-Biedl syndrome 10 [vision issues at night & peripheral]**

id 164 = BCL6 corepressor -- eye formation -- |
id 165 = BCR, RhoGEF and GTPase activating protein --helps cell function & movement--|

id 166 = causes hair issues & hearing problems

id 169 =  ??

id 171 = regulates dna replication

**id 175 = regulation & can cause cancer if mutated**

id 176 = dna repair

id 177 = dna repair

id 178 = liipid droplets

id 181 = helps protect body from infections

id 182 = ??

# **R**

In [8]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


r_in_beginning = 'r'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withr = gene_df[gene_df['FullName'].str.startswith(r_in_beginning)]

print("Data with 'Full Names' starting with 'r':")
print(filtered_data_withr)
print("________________________________")
print("There are a total of",filtered_data_withr['FullName'].count(),"human genes within the r section of this gene data")
print("Also there are 24 groups of genes that start with r")

Data with 'Full Names' starting with 'r':
        id shortName                                           FullName
1096  1097      RAG1                         recombination activating 1
1097  1098      RAG2                         recombination activating 2
1098  1099      RAI1                            retinoic acid induced 1
1100  1101     RAPSN         receptor associated protein of the synapse
1101  1102      RARA                       retinoic acid receptor alpha
1106  1107      RBPJ  recombination signal binding protein for immun...
1107  1108      RDH5                            retinol dehydrogenase 5
1109  1110     REEP1                       receptor accessory protein 1
1110  1111      RELN                                             reelin
1111  1112       REN                                              renin
1113  1114       RET                                 ret proto-oncogene
1114  1115   RETREG1                          reticulophagy regulator 1
1115  1116      RFX5  

# **I**

In [9]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


I_in_beginning = 'I'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withI = gene_df[gene_df['FullName'].str.startswith(I_in_beginning)]

print("Data with 'Full Names' starting with 'r':")
print(filtered_data_withI)
print("________________________________")
print("There are a total of",filtered_data_withI['FullName'].count(),"human genes within the I section of this gene data")


Data with 'Full Names' starting with 'r':
Empty DataFrame
Columns: [id, shortName, FullName]
Index: []
________________________________
There are a total of 0 human genes within the I section of this gene data


# **A**

In [10]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


a_in_beginning = 'A'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withA = gene_df[gene_df['FullName'].str.startswith(a_in_beginning)]

print("Data with 'Full Names' starting with 'r':")
print(filtered_data_withA)
print("________________________________")
print("There are a total of",filtered_data_withA['FullName'].count(),"human genes within the A section of this gene data")
print("Also there are 27 groups of genes that start with A")

Data with 'Full Names' starting with 'r':
      id shortName                                           FullName
3      4     ABCA1          ATP binding cassette subfamily A member 1
4      5     ABCA3          ATP binding cassette subfamily A member 3
5      6     ABCA4          ATP binding cassette subfamily A member 4
6      7    ABCA12         ATP binding cassette subfamily A member 12
7      8     ABCB4          ATP binding cassette subfamily B member 4
8      9     ABCB7          ATP binding cassette subfamily B member 7
9     10    ABCB11         ATP binding cassette subfamily B member 11
10    11     ABCC2          ATP binding cassette subfamily C member 2
11    12     ABCC6          ATP binding cassette subfamily C member 6
12    13     ABCC8          ATP binding cassette subfamily C member 8
13    14     ABCC9          ATP binding cassette subfamily C member 9
14    15     ABCD1          ATP binding cassette subfamily D member 1
15    16     ABCD4          ATP binding cassette

# **U**

In [11]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


U_in_beginning = 'U'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withU = gene_df[gene_df['FullName'].str.startswith(U_in_beginning)]

print("Data with 'Full Names' starting with 'r':")
print(filtered_data_withU)
print("________________________________")
print("There are a total of",filtered_data_withU['FullName'].count(),"human genes within the U section of this gene data")
print("Also there are 5 groups of genes that start with U")

Data with 'Full Names' starting with 'r':
        id shortName                                        FullName
537    538      GALE                       UDP-galactose-4-epimerase
1423  1424    UGT1A1  UDP glucuronosyltransferase family 1 member A1
1430  1431      USB1         U6 snRNA biogenesis phosphodiesterase 1
1432  1433      UTP4        UTP4, small subunit processome component
1433  1434     UVSSA                UV stimulated scaffold protein A
________________________________
There are a total of 5 human genes within the U section of this gene data
Also there are 5 groups of genes that start with U


# **N**

In [12]:
gene_df = pd.DataFrame(gene_df_no_duplicates)


N_in_beginning = 'N'  # Beginning part of the values I want

# Filter rows where the 'FullName' column starts with 'ATP'
filtered_data_withN = gene_df[gene_df['FullName'].str.startswith(N_in_beginning)]

print("Data with 'Full Names' starting with 'r':")
print(filtered_data_withN)
print("________________________________")
print("There are a total of",filtered_data_withN['FullName'].count(),"human genes within the N section of this gene data")
print("Also there are 17 groups of genes that start with N")

Data with 'Full Names' starting with 'r':
        id shortName                                           FullName
124    125     ASAH1                 N-acylsphingosine amidohydrolase 1
580    581    GNPTAB  N-acetylglucosamine-1-phosphate transferase su...
581    582     GNPTG  N-acetylglucosamine-1-phosphate transferase su...
891    892     NAGLU                     N-acetyl-alpha-glucosaminidase
892    893      NAGS                         N-acetylglutamate synthase
899    900       NDP             NDP, norrin cystine knot growth factor
904    905    NFKBIA                               NFKB inhibitor alpha
905    906      NFU1                  NFU1 iron-sulfur cluster scaffold
907    908     NGLY1                                      N-glycanase 1
908    909    NHLRC1  NHL repeat containing E3 ubiquitin protein lig...
909    910     NIPBL                      NIPBL, cohesin loading factor
910    911    NKX2-1                                     NK2 homeobox 1
911    912     NLRP1  

# **A**
my first name is **BRIAUNA**
no need to repeat CODE since I have already done A



In [13]:
#combining all df to one to make "Briauna's Genes"
Briauna_genes_ = pd.concat([filtered_data_withb, filtered_data_withr, filtered_data_withI, filtered_data_withA, filtered_data_withU, filtered_data_withN], axis=0)
Briauna_genes_.head()

print("Briauna has a total of 149 genes which is doubled by both parents to total 298")
print("There are 85 different kinds of genes")

Briauna has a total of 149 genes which is doubled by both parents to total 298
There are 85 different kinds of genes


In [14]:
Briauna_genes_.to_csv('Briauna_genes_.csv', index=False)