In [1]:
import pandas as pd

In [2]:
path = '/itf-fi-ml/shared/users/ziyuzh/svm/data/pre_processed_features/expression_emb/gene2vec_dim_200_iter_9.txt'

# Read the data from the text file
with open(path, 'r') as file:
    lines = file.readlines()

# Create a dictionary to store the data
data_dict = {}

# Loop through each line and capture the information in the dictionary
for line in lines:
    parts = line.strip().split('\t')
    key = parts[0]
    values = list(map(float, parts[1].split()))
    data_dict[key] = values

# Convert the dictionary to a DataFrame
df = pd.DataFrame.from_dict(data_dict, orient='index')


In [3]:
import os
local_stringdb = os.path.join('/itf-fi-ml/shared/users/ziyuzh/svm/data/stringdb','2023')

ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.info.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
ppidf['preferred_name'] = ppidf['preferred_name'].str.upper()
stringId2name = ppidf.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = ppidf.set_index('preferred_name')['#string_protein_id'].to_dict()
ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.aliases.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'alias']).drop_duplicates(['alias'], keep='first')
ppidf['alias'] = ppidf['alias'].str.upper()
aliases2stringId = ppidf.set_index('alias')['#string_protein_id'].to_dict()

def string_convert(gene):
    if gene in name2stringId.keys():
        return name2stringId[gene]
    elif gene in aliases2stringId.keys():
        return aliases2stringId[gene]
    else:
        return None
    


In [4]:
df.shape

(24447, 200)

In [5]:
string_dict = dict()
for gene in df.index.tolist():
    string_dict[gene] = string_convert(gene)
df['string_id'] = df.index.map(string_dict)
df.shape

(24447, 201)

In [19]:
df[~df['string_id'].isna()]['string_id'].nunique()

17481

In [6]:
df = df.dropna(subset='string_id')
len(df)

17622

In [7]:
df_unique = df.drop_duplicates(subset='string_id')
len(df_unique)

17481

In [8]:
# Assuming you have a DataFrame df and a dictionary name2stringId
# Find duplicated rows based on 'string_id'
duplicated_rows = df[df.duplicated(subset='string_id', keep=False)]

# Filter rows based on the condition
rows_to_keep = duplicated_rows.index.isin(name2stringId.keys())

# Create a new DataFrame with the rows to keep
df_filtered = duplicated_rows[rows_to_keep]

# If you want to remove these from the original df:
df = df.drop(df.index.difference(df_filtered.index))

# Alternatively, if you want only the remaining DataFrame:
# df_filtered will be your final DataFrame with rows you want to keep


In [9]:
len(df)

95

In [10]:
df_combined = pd.concat([df_unique, df], axis=0)
len(df_combined)

17576

In [11]:
df_combined.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,string_id
CISD3,0.148815,-0.100394,0.139844,-0.156749,-0.000815,0.115267,0.101778,0.15858,0.287842,0.049861,...,0.086968,0.062115,0.314489,-0.148824,-0.121338,-0.245106,0.247893,-0.034497,0.038109,9606.ENSP00000483781
KDM1A,-0.037154,0.03877,-0.100821,0.16552,0.324188,0.095911,0.090766,0.336213,0.334166,-0.296031,...,-0.252097,0.266917,-0.333782,-0.011449,0.064207,0.033128,0.222754,0.234885,-0.410366,9606.ENSP00000383042
HIST1H2BN,-0.293039,-0.049834,-0.074517,-0.003607,0.06742,-0.154789,0.113428,-0.114366,0.048271,0.186912,...,0.056385,0.146475,-0.034893,0.039808,-0.165166,0.168566,-0.207136,-0.079345,0.191214,9606.ENSP00000483903


In [12]:
new_columns = ['string_id'] + [f'feature_{i}' for i, col in enumerate(df_combined.columns) if col != 'string_id']

# Reorder the DataFrame so that 'string_id' is the first column
df_combined = df_combined[['string_id'] + [col for col in df_combined.columns if col != 'string_id']]
df_combined.columns = new_columns

df_combined.head(2)

Unnamed: 0,string_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_190,feature_191,feature_192,feature_193,feature_194,feature_195,feature_196,feature_197,feature_198,feature_199
CISD3,9606.ENSP00000483781,0.148815,-0.100394,0.139844,-0.156749,-0.000815,0.115267,0.101778,0.15858,0.287842,...,0.227863,0.086968,0.062115,0.314489,-0.148824,-0.121338,-0.245106,0.247893,-0.034497,0.038109
KDM1A,9606.ENSP00000383042,-0.037154,0.03877,-0.100821,0.16552,0.324188,0.095911,0.090766,0.336213,0.334166,...,0.209735,-0.252097,0.266917,-0.333782,-0.011449,0.064207,0.033128,0.222754,0.234885,-0.410366


In [13]:
df_combined.to_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/pre_processed_features/expression_emb/exp_emb.csv',index=False)