In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets with necessary columns and drop duplicates
kotak_data = pd.read_csv("D:\\stock\\kotak.csv", usecols=['instrumentName', 'name', 'isin']).drop_duplicates()
fyers_data = pd.read_csv("D:\\stock\\fyers.csv", usecols=['Symbol ticker', 'Symbol Details', 'ISIN']).drop_duplicates()
angel_data = pd.read_csv("D:\\stock\\angelone.csv", usecols=['symbol', 'name']).drop_duplicates()
zerodha_data = pd.read_csv("D:\\stock\\zerodha.csv", usecols=['tradingsymbol', 'name']).drop_duplicates()


  kotak_data = pd.read_csv("D:\\deepak\\akash\\stock\\kotak.csv", usecols=['instrumentName', 'name', 'isin']).drop_duplicates()
  fyers_data = pd.read_csv("D:\\deepak\\akash\\stock\\fyers.csv", usecols=['Symbol ticker', 'Symbol Details', 'ISIN']).drop_duplicates()


In [3]:
# Rename columns for clarity
kotak_data.rename(columns={'instrumentName': 'Kotak ID', 'name': 'Instrument Name', 'isin': 'ISIN'}, inplace=True)
fyers_data.rename(columns={'Symbol ticker': 'Fyers ID', 'Symbol Details': 'Instrument Name'}, inplace=True)
angel_data.rename(columns={'symbol': 'Angel One ID', 'name': 'Instrument Name'}, inplace=True)
zerodha_data.rename(columns={'tradingsymbol': 'Zerodha ID', 'name': 'Instrument Name'}, inplace=True)


In [4]:
# Normalize Instrument Name for consistency
for df in [kotak_data, fyers_data, angel_data, zerodha_data]:
    df['Instrument Name'] = df['Instrument Name'].str.strip().str.upper()


In [5]:
# Filter out rows with NaN ISIN in both kotak_data and fyers_data
kotak_data = kotak_data[kotak_data['ISIN'].notna()]
fyers_data = fyers_data[fyers_data['ISIN'].notna()]


In [6]:
merged_df = pd.merge(kotak_data, fyers_data, on='ISIN', how='outer', suffixes=('_Kotak', '_Fyers'))


In [7]:
merged_df.sample(5)

Unnamed: 0,Kotak ID,Instrument Name_Kotak,ISIN,Instrument Name_Fyers,Fyers ID
3586,BALLARPUR,BALLARPUR INDUSTRIES LTD,INE294A01037,,
8053,,,IN2920200507,SDL RJ 6.62% 2030,NSE:662RJ30-SG
17849,,,INE584A08010,NSL-7.30%-28-8-25-PVT,BSE:730NSL25-F
10831,,,IN3420230242,SDL WB 7.44% 2044,NSE:744WB44-SG
11830,,,INE0BSN01013,S A TECH SOFTWARE INDIA L,NSE:SATECH-ST


In [8]:
# Ensure 'Instrument Name' exists in merged_df after merging
if 'Instrument Name_Kotak' in merged_df.columns:
    merged_df['Instrument Name'] = merged_df['Instrument Name_Kotak'].fillna(merged_df['Instrument Name_Fyers'])
elif 'Instrument Name_Fyers' in merged_df.columns:
    merged_df['Instrument Name'] = merged_df['Instrument Name_Fyers']


In [9]:
merged_df.sample(5)

Unnamed: 0,Kotak ID,Instrument Name_Kotak,ISIN,Instrument Name_Fyers,Fyers ID,Instrument Name
1458,JISLJALEQS,JAIN IRRIGATION SYSTEMS LTD,INE175A01038,JAIN IRRIGATION SYSTEMS,NSE:JISLJALEQS-EQ,JAIN IRRIGATION SYSTEMS LTD
13870,,,INE047E01031,JMD VENTURES LIMITED,BSE:JMDVL-X,JMD VENTURES LIMITED
5669,STARTECK,STARTECH FINANCE LTD,INE992I01013,STARTECK FINANCE LIMITED,BSE:STARTECK-B,STARTECH FINANCE LTD
15953,,,IN000550C011,GS06MAY50,BSE:GS06MAY50-G,GS06MAY50
10381,,,IN3120200214,SDL TN 6.41% 2030,NSE:641TN30-SG,SDL TN 6.41% 2030


In [10]:
# Drop redundant columns if they exist
merged_df.drop(columns=['Instrument Name_Kotak', 'Instrument Name_Fyers'], errors='ignore', inplace=True)


In [19]:
def jaccard_similarity(a, b):
    a_tokens = set(a.split())
    b_tokens = set(b.split())
    return len(a_tokens & b_tokens) / len(a_tokens | b_tokens)


In [21]:
def tfidf_cosine_similarity_with_jaccard(main_df, match_df, main_key, match_key, output_key, cos_threshold=0.7, jaccard_threshold=0.3):
    main_names = main_df[main_key].dropna().unique()
    match_names = match_df[match_key].dropna().unique()
    
    if len(main_names) == 0 or len(match_names) == 0:
        return pd.DataFrame(columns=['Instrument Name', output_key])
    
    all_names = pd.concat([pd.Series(main_names), pd.Series(match_names)]).unique()
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_names)
    
    main_vectors = tfidf_matrix[:len(main_names)]
    match_vectors = tfidf_matrix[len(main_names):]
    
    similarity_matrix = cosine_similarity(match_vectors, main_vectors)
    
    matched_pairs = []
    for idx, row in enumerate(similarity_matrix):
        best_match_idx = row.argmax()
        best_score = row[best_match_idx]
        if best_score >= cos_threshold:
            main_name = main_names[best_match_idx]
            match_name = match_names[idx]
            # Apply Jaccard similarity as a secondary check
            jaccard_score = jaccard_similarity(main_name, match_name)
            if jaccard_score >= jaccard_threshold:
                matched_pairs.append({
                    'Instrument Name': main_name,
                    output_key: match_name
                })
    return pd.DataFrame(matched_pairs)



In [22]:
angel_matches = tfidf_cosine_similarity_with_jaccard(merged_df, angel_data, 'Instrument Name', 'Instrument Name', 'Angel One ID', cos_threshold=0.7, jaccard_threshold=0.3)


In [24]:
angel_matches.sample(5)

Unnamed: 0,Instrument Name,Angel One ID
120,BIGBLOC CONSTRUCTION LTD,BIGBLOC
156,HDFCAMC - HFMPCC75RD,HFMPCC75RD
61,OMAXE LTD,OMAXE
114,KAUSHALYA LOGISTICS LTD,KAUSHALYA
67,UTIAMC - UTIRPP36P1,UTIRPP36P1


In [25]:
zerodha_matches = tfidf_cosine_similarity_with_jaccard(merged_df, zerodha_data, 'Instrument Name', 'Instrument Name', 'Zerodha ID', cos_threshold=0.7, jaccard_threshold=0.3)


In [26]:
zerodha_matches.sample(5)

Unnamed: 0,Instrument Name,Zerodha ID
8,LIC NOMURA MF ETF - NIFTY 100,INAV -LIC MF ETF NIFTY 100
75,IFCI LTD,IFCI
58,BIRLA CABLE LTD,BIRLA CABLE
51,BASF INDIA LTD,BASF INDIA
43,ATV PROJECTS INDIA LTD,ATV PROJECTS INDIA


In [34]:
# Merge the matches back into the main DataFrame
final_df = merged_df.merge(angel_matches, on='Instrument Name', how='left')


In [35]:
final_df = final_df.merge(zerodha_matches, on='Instrument Name', how='left')


In [43]:
final_df.sample(5)

Unnamed: 0,Instrument Name,Kotak ID,Fyers ID,Angel One ID,Zerodha ID
10694,ABHFL 8% 2034 SR J1,,NSE:8ABHF34-N4,,
7486,SDL HP 7.48% 2033,,NSE:748HP33-SG,,
7378,773GUJSDL32,,BSE:773GJ32-G,,
4643,COSMO FERRITES LTD,COSMOFE,BSE:COSMOFE-XT,,
5881,WALL STREET FINANCE LTD,WSFIN,BSE:WSFX-X,,


In [42]:
final_df = final_df[['Instrument Name', 'Kotak ID', 'Fyers ID', 'Angel One ID', 'Zerodha ID']]


In [44]:
final_df.shape

(20216, 5)

In [45]:
final_df.to_csv("D:\\stock\\final_mapping.csv")