In [None]:
import pandas as pd
import numpy as np
from langdetect import detect

# ----- Define language detection functions -----
def isFrench(aString):
  """Return True when langdetect classifies ``aString`` as French.

  Args:
    aString: The text to classify.

  Returns:
    True if ``detect`` reports the language code 'fr'; False otherwise.
    Non-string values (for example the NaN pandas uses for a missing
    cell), blank strings, and text for which detection fails all yield
    False instead of raising.
  """
  # Nothing to classify: a missing cell or whitespace-only text has no language.
  if not isinstance(aString, str) or not aString.strip():
    return False

  # Imported here so the function carries its own dependency on the
  # exception type; the visible import block only brings in `detect`.
  from langdetect.lang_detect_exception import LangDetectException

  try:
    return detect(aString) == 'fr'
  except LangDetectException:
    # langdetect raises when the text has no usable features
    # (e.g. digits or punctuation only).
    return False

def isEnglish(aString):
  """Return True when langdetect classifies ``aString`` as English.

  Args:
    aString: The text to classify.

  Returns:
    True if ``detect`` reports the language code 'en'; False otherwise.
    Non-string values (for example the NaN pandas uses for a missing
    cell), blank strings, and text for which detection fails all yield
    False instead of raising.
  """
  # Nothing to classify: a missing cell or whitespace-only text has no language.
  if not isinstance(aString, str) or not aString.strip():
    return False

  # Imported here so the function carries its own dependency on the
  # exception type; the visible import block only brings in `detect`.
  from langdetect.lang_detect_exception import LangDetectException

  try:
    return detect(aString) == 'en'
  except LangDetectException:
    # langdetect raises when the text has no usable features
    # (e.g. digits or punctuation only).
    return False

# ----- Load the combined CSV and split it into one frame per language -----
csvDf = pd.read_csv('manabvndataset-abvn.csv', sep=',')

# English frame: built from the 'Abbreviation' and 'Term' columns.
abbrev_eng, term_eng = csvDf['Abbreviation'], csvDf['Term']
frame_eng = {'Abbreviation': abbrev_eng, 'Term': term_eng}
frame_eng_df = pd.DataFrame(data=frame_eng)

# French frame: built from the 'Abréviation' and 'Terme' columns.
abbrev_fr, term_fr = csvDf['Abréviation'], csvDf['Terme']
frame_fr = {'Abréviation': abbrev_fr, 'Terme': term_fr}
frame_fr_df = pd.DataFrame(data=frame_fr)

# Report the starting row counts before any filtering.
print("Initial Length Eng: ", frame_eng_df.shape[0])
print("Initial Length Fr: ", frame_fr_df.shape[0])

# ----- Remove rows with empty abbreviations -----
# Treat empty-string abbreviations as missing, then drop every row whose
# abbreviation is missing. The replaced column is assigned back explicitly:
# calling replace(..., inplace=True) on a column selection is chained
# assignment, which pandas warns about and which does not update the frame
# under Copy-on-Write.
frame_eng_df['Abbreviation'] = frame_eng_df['Abbreviation'].replace('', np.nan)
frame_eng_df.dropna(subset=['Abbreviation'], inplace=True)
frame_fr_df['Abréviation'] = frame_fr_df['Abréviation'].replace('', np.nan)
frame_fr_df.dropna(subset=['Abréviation'], inplace=True)

# Report the row counts after dropping rows without an abbreviation.
print("\nLength Eng (Remove Empty Abbreviations): ", len(frame_eng_df))
print("Length Fr (Remove Empty Abbreviations): ", len(frame_fr_df))

# ----- Keep only rows whose term is detected as the expected language -----
# Each term is passed through the matching detector; rows mapped to False
# are dropped from the frame.
frame_eng_df = frame_eng_df[frame_eng_df['Term'].map(isEnglish)]
frame_fr_df = frame_fr_df[frame_fr_df['Terme'].map(isFrench)]

# Report the row counts after the language filter.
print("\nLength Eng (Filtered Language): ", frame_eng_df.shape[0])
print("Length Fr (Filtered Language): ", frame_fr_df.shape[0])

# ----- Drop fully duplicated rows, keeping the first occurrence -----
frame_eng_df = frame_eng_df.drop_duplicates(keep='first')
frame_fr_df = frame_fr_df.drop_duplicates(keep='first')

# Report the final row counts.
print("\nLength Eng (Removed Duplicates): ", frame_eng_df.shape[0])
print("Length Fr (Removed Duplicates): ", frame_fr_df.shape[0])

# ----- Export into CSV files (disabled: uncomment the lines below to write the files) -----
#frame_eng_df.to_csv('Abbrev_English.csv', index=False)
#frame_fr_df.to_csv('Abbrev_French.csv', index=False)