In [43]:
import pandas as pd
import re

In [44]:
german_file = r'training-parallel-nc-v9\training\news-commentary-v9.de-en.de'
english_file = r'training-parallel-nc-v9\training\news-commentary-v9.de-en.en'

with open(german_file, 'r', encoding='utf-8') as f:
    german_sentences = f.readlines()

with open(english_file, 'r', encoding='utf-8') as f:
    english_sentences = f.readlines()

In [45]:
df_german = pd.DataFrame(german_sentences, columns=['German'])
df_english = pd.DataFrame(english_sentences, columns=['English'])

In [46]:
df_german.info

<bound method DataFrame.info of                                                    German
0                        Steigt Gold auf 10.000 Dollar?\n
1       SAN FRANCISCO – Es war noch nie leicht, ein ra...
2       In letzter Zeit allerdings ist dies schwierige...
3       Erst letzten Dezember verfassten meine Kollege...
4                     Und es kam, wie es kommen musste.\n
...                                                   ...
201849  Das bleibt eine der größten Errungenschaften i...
201850  Gleichzeitig scheint sich Zumas revolutionäre ...
201851  In einer Region, wo die älteren Menschen sehr ...
201852  Drei von zehn Südafrikanern sind jünger als 15...
201853  Irgendwie muss Zuma einen Weg finden, einersei...

[201854 rows x 1 columns]>

In [47]:
df_english.info

<bound method DataFrame.info of                                                   English
0                                         $10,000 Gold?\n
1       SAN FRANCISCO – It has never been easy to have...
2       Lately, with gold prices up more than 300% ove...
3       Just last December, fellow economists Martin F...
4                                 Wouldn’t you know it?\n
...                                                   ...
201990  Their achievement remains one of the greatest ...
201991  At the same time, Zuma’s revolutionary generat...
201992  In a region that reveres the elderly, Zuma’s a...
201993  Three in ten South Africans are younger than 1...
201994  Somehow Zuma must find a way to honor his own ...

[201995 rows x 1 columns]>

In [48]:
combined_df = pd.concat([df_german, df_english], axis=1)

In [49]:
combined_df = combined_df.drop_duplicates().reset_index(drop=True)

In [51]:
combined_df.info

<bound method DataFrame.info of                                                    German  \
0                        Steigt Gold auf 10.000 Dollar?\n   
1       SAN FRANCISCO – Es war noch nie leicht, ein ra...   
2       In letzter Zeit allerdings ist dies schwierige...   
3       Erst letzten Dezember verfassten meine Kollege...   
4                     Und es kam, wie es kommen musste.\n   
...                                                   ...   
201931                                                NaN   
201932                                                NaN   
201933                                                NaN   
201934                                                NaN   
201935                                                NaN   

                                                  English  
0                                         $10,000 Gold?\n  
1       SAN FRANCISCO – It has never been easy to have...  
2       Lately, with gold prices up more than 300% ove.

In [52]:
combined_df.describe

<bound method NDFrame.describe of                                                    German  \
0                        Steigt Gold auf 10.000 Dollar?\n   
1       SAN FRANCISCO – Es war noch nie leicht, ein ra...   
2       In letzter Zeit allerdings ist dies schwierige...   
3       Erst letzten Dezember verfassten meine Kollege...   
4                     Und es kam, wie es kommen musste.\n   
...                                                   ...   
201931                                                NaN   
201932                                                NaN   
201933                                                NaN   
201934                                                NaN   
201935                                                NaN   

                                                  English  
0                                         $10,000 Gold?\n  
1       SAN FRANCISCO – It has never been easy to have...  
2       Lately, with gold prices up more than 300% ov

In [53]:
combined_df.to_csv(r'C:\Users\Lenovo\Desktop\CapstoneProject\Capstone\combined_dataset.csv', index=False)

In [54]:
combined_df['German'] = combined_df['German'].fillna('')
combined_df['English'] = combined_df['English'].fillna('')

In [55]:
combined_df.info

<bound method DataFrame.info of                                                    German  \
0                        Steigt Gold auf 10.000 Dollar?\n   
1       SAN FRANCISCO – Es war noch nie leicht, ein ra...   
2       In letzter Zeit allerdings ist dies schwierige...   
3       Erst letzten Dezember verfassten meine Kollege...   
4                     Und es kam, wie es kommen musste.\n   
...                                                   ...   
201931                                                      
201932                                                      
201933                                                      
201934                                                      
201935                                                      

                                                  English  
0                                         $10,000 Gold?\n  
1       SAN FRANCISCO – It has never been easy to have...  
2       Lately, with gold prices up more than 300% ove.

In [56]:
def clean_sentence(sentence):
    sentence = sentence.strip()  # Remove leading and trailing white spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Replace multiple spaces with a single space
    sentence = re.sub(r'[^\w\s.,!?\'"]', '', sentence)  # Remove special characters except common punctuation
    return sentence

In [57]:
combined_df['German'] = combined_df['German'].apply(clean_sentence)
combined_df['English'] = combined_df['English'].apply(clean_sentence)

In [58]:
combined_df['German']

0                            Steigt Gold auf 10.000 Dollar?
1         SAN FRANCISCO  Es war noch nie leicht, ein rat...
2         In letzter Zeit allerdings ist dies schwierige...
3         Erst letzten Dezember verfassten meine Kollege...
4                         Und es kam, wie es kommen musste.
                                ...                        
201931                                                     
201932                                                     
201933                                                     
201934                                                     
201935                                                     
Name: German, Length: 201936, dtype: object

In [59]:
combined_df['English']

0                                              10,000 Gold?
1         SAN FRANCISCO  It has never been easy to have ...
2         Lately, with gold prices up more than 300 over...
3         Just last December, fellow economists Martin F...
4                                      Wouldnt you know it?
                                ...                        
201931    Their achievement remains one of the greatest ...
201932    At the same time, Zumas revolutionary generati...
201933    In a region that reveres the elderly, Zumas at...
201934    Three in ten South Africans are younger than 1...
201935    Somehow Zuma must find a way to honor his own ...
Name: English, Length: 201936, dtype: object

### this line removes the row that are empty

In [60]:
combined_df = combined_df[(combined_df['German'].str.strip() != '') & (combined_df['English'].str.strip() != '')]

### Remove duplicates

In [61]:
combined_df = combined_df.drop_duplicates(subset=['German', 'English']).reset_index(drop=True)

### Ensure sentence alignment

In [62]:
assert len(combined_df['German']) == len(combined_df['English']), "Mismatch in the number of German and English sentences."

In [63]:
combined_df['German'] = combined_df['German'].str.lower()
combined_df['English'] = combined_df['English'].str.lower()

In [64]:
max_length = 200

In [65]:
def word_count(sentence):
    return len(sentence.split())

In [66]:
combined_df = combined_df[combined_df['German'].apply(word_count) <= max_length]
combined_df = combined_df[combined_df['English'].apply(word_count) <= max_length]

In [68]:
combined_df.to_csv(r'C:\Users\Lenovo\Desktop\CapstoneProject\Capstone\cleaned_dataset.csv', index=False)