In [1]:
%%capture
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
DATA = Path.cwd().parent / 'data'
SUMM_FOLDER = DATA / 'summaries_finetune'
TEXT_FILES = SUMM_FOLDER / 'text_files_copy'
SOURCE_TEXTS = SUMM_FOLDER / 'source_texts_clean'

In [3]:
p = Path(TEXT_FILES).rglob('*')
file_dict = {'file_path': [x for x in p if x.is_file()]}
file_dict['file_name'] = [str(x).split('/')[-1] for x in file_dict['file_path']]

In [4]:
file_df = pd.DataFrame.from_dict(file_dict).set_index('file_name')

In [5]:
# The original file had 500 empty columns. Don't need those
summaries_df = pd.read_csv(SUMM_FOLDER / 'final_summaries_ai_aloe.csv').iloc[:,0:14]

# Here's a function to get the text. I'm ignoring encoding errors for now, but I need to figure that out.
def getText(filename):
    if filename in file_df.index:
        with open(file_df.loc[filename, 'file_path'], 'r',encoding='utf-8', errors='ignore') as f:
            text = f.read()
        return text
    else:
        return np.nan

# Apply the function
summaries_df['text'] = summaries_df['filename'].apply(lambda x: getText(x))

In [13]:
for row in summaries_df.iterrows():
    with open(SUMM_FOLDER / ('summaries/' + row[1]['filename_clean'] + '.txt'), 'w') as f:
        f.write(row[1]['text'])

## Get the Source Texts in there

In [8]:
p = Path(SOURCE_TEXTS).rglob('*')
file_dict = {'file_path': [x for x in p if x.is_file()]}
file_dict['file_name'] = [str(x).split('/')[-1] for x in file_dict['file_path']]
file_df = pd.DataFrame.from_dict(file_dict)
file_df['source_text_clean'] = file_df['file_name'].apply(lambda x: x.split('.')[0])
file_df = file_df.set_index('source_text_clean')

In [9]:
summaries_df['source_text_clean'] = summaries_df['source_text'].apply(lambda x: x.split('.')[0])

### The prompt names in the csv are wrong. Changing them.

In [10]:
prompt_names_df = pd.read_csv('prompt_names.csv').set_index('csvname_clean')
prompt_names_df.loc['Hybrids', 'filename_clean']

'24_Hybrids.txt'

In [11]:
summaries_df['source_text_filename_clean'] = summaries_df['source_text_clean'].apply(lambda x: prompt_names_df.loc[x, 'filename_clean'])
summaries_df['source_text_filename_clean'] = summaries_df['source_text_filename_clean'].apply(lambda x: '24_Hybrids' if x == '24_Hybrids.txt' else x)


In [12]:
# Fix the sun exposure text
def fixSunSource(row):
    filename_list = row['filename'].split('_')
    if filename_list[0] == 'MDSum':
        text_num = filename_list[-1].split('.')[0]
        if text_num =='Text1':
            return 'Sun_exposure_Text1'
        elif text_num == 'Text2':
            return 'Sun_exposure_Text2'
        elif text_num == 'Text3':
            return 'Sun_exposure_Text3'
        elif text_num == 'Text4':
            return 'Sun_exposure_Text4'
        elif text_num == 'Text5':
            return 'Sun_exposure_Text5'
        else:
            return row['source_text_filename_clean']
    else:
        return row['source_text_filename_clean']

summaries_df['source_text_filename_clean'] = summaries_df.apply(lambda x: fixSunSource(x), axis=1)
summaries_df

Unnamed: 0.1,Unnamed: 0,Row.names,filename,filename_clean,source_text,Main.Point,Details,Cohesion,Objective.Language,Wording.Para,Lang..Bey..ST,Summ..Length,content_pca,paraphrase_pca,text,source_text_clean,source_text_filename_clean
0,1,1,1091_CivilServices .txt,1091_CivilServices,CivilService.txt,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.37,0.785,hard work pays off / / \n,CivilService,11_CivilService
1,2,10,455436_Hybrids .txt,455436_Hybrids,Hybrids.txt,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.37,0.245,There will be a more petroleum product coming...,Hybrids,24_Hybrids
2,3,100,355927_SexDetermination .txt,355927_SexDetermination,Sex Determination.txt,1.0,2.0,3.0,1.5,0.5,1.0,1.0,5.60,1.030,Males have one Y chromosome and one X chromoso...,Sex Determination,Sex Determination
3,4,1000,MDSum_A_Sum_037_Text1.txt.txt,MDSum_A_Sum_037_Text1,sun_exposure,2.5,2.0,1.5,2.0,2.5,1.5,1.0,5.42,3.435,Electromagnetic radiation with an energy and s...,sun_exposure,Sun_exposure_Text1
4,5,1001,MDSum_A_Sum_037_Text2.txt.txt,MDSum_A_Sum_037_Text2,sun_exposure,2.5,2.0,2.0,2.5,3.0,2.0,1.0,5.86,4.220,It is highly recommended by doctors that peopl...,sun_exposure,Sun_exposure_Text2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4685,4686,995,MDSum_A_Sum_029_Text2.txt.txt,MDSum_A_Sum_029_Text2,sun_exposure,2.5,2.0,2.5,2.5,1.0,2.0,2.5,6.30,2.060,The results from many studies indicate that vi...,sun_exposure,Sun_exposure_Text2
4686,4687,996,MDSum_A_Sum_029_Text4.txt.txt,MDSum_A_Sum_029_Text4,sun_exposure,2.5,2.0,3.0,2.0,1.0,2.0,1.5,6.74,2.060,People are being advised to spend less time in...,sun_exposure,Sun_exposure_Text4
4687,4688,997,MDSum_A_Sum_029_Text5.txt.txt,MDSum_A_Sum_029_Text5,sun_exposure,2.5,2.5,2.0,2.5,1.0,2.0,1.0,6.41,2.060,We are thus in a situation where people are re...,sun_exposure,Sun_exposure_Text5
4688,4689,998,MDSum_A_Sum_031_Text3.txt.txt,MDSum_A_Sum_031_Text3,sun_exposure,2.5,3.0,3.5,2.0,2.5,2.5,3.0,8.28,3.925,"There are two types of cancers, melanoma and b...",sun_exposure,Sun_exposure_Text3


In [15]:
def getText(filename):
    if filename in file_df.index:
        with open(file_df.loc[filename, 'file_path'], 'r',encoding='utf-8', errors='ignore') as f:
            text = f.read()
        return text
    else:
        return np.nan
    
summaries_df['source'] = summaries_df['source_text_filename_clean'].apply(lambda x: getText(x))
summaries_df[summaries_df['source'].isna()]
summaries_df

Unnamed: 0.1,Unnamed: 0,Row.names,filename,filename_clean,source_text,Main.Point,Details,Cohesion,Objective.Language,Wording.Para,Lang..Bey..ST,Summ..Length,content_pca,paraphrase_pca,text,source_text_clean,source_text_filename_clean,source
0,1,1,1091_CivilServices .txt,1091_CivilServices,CivilService.txt,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.37,0.785,hard work pays off / / \n,CivilService,11_CivilService,\nCivil service offers jobs to thousands of me...
1,2,10,455436_Hybrids .txt,455436_Hybrids,Hybrids.txt,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.37,0.245,There will be a more petroleum product coming...,Hybrids,24_Hybrids,\nGasoline prices are getting higher and highe...
2,3,100,355927_SexDetermination .txt,355927_SexDetermination,Sex Determination.txt,1.0,2.0,3.0,1.5,0.5,1.0,1.0,5.60,1.030,Males have one Y chromosome and one X chromoso...,Sex Determination,Sex Determination,Mode of Inheritance: Sex Determination\nRecall...
3,4,1000,MDSum_A_Sum_037_Text1.txt.txt,MDSum_A_Sum_037_Text1,sun_exposure,2.5,2.0,1.5,2.0,2.5,1.5,1.0,5.42,3.435,Electromagnetic radiation with an energy and s...,sun_exposure,Sun_exposure_Text1,Ultraviolet radiation\n\nUltraviolet radiation...
4,5,1001,MDSum_A_Sum_037_Text2.txt.txt,MDSum_A_Sum_037_Text2,sun_exposure,2.5,2.0,2.0,2.5,3.0,2.0,1.0,5.86,4.220,It is highly recommended by doctors that peopl...,sun_exposure,Sun_exposure_Text2,Research indicates that sufficient amounts of ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4685,4686,995,MDSum_A_Sum_029_Text2.txt.txt,MDSum_A_Sum_029_Text2,sun_exposure,2.5,2.0,2.5,2.5,1.0,2.0,2.5,6.30,2.060,The results from many studies indicate that vi...,sun_exposure,Sun_exposure_Text2,Research indicates that sufficient amounts of ...
4686,4687,996,MDSum_A_Sum_029_Text4.txt.txt,MDSum_A_Sum_029_Text4,sun_exposure,2.5,2.0,3.0,2.0,1.0,2.0,1.5,6.74,2.060,People are being advised to spend less time in...,sun_exposure,Sun_exposure_Text4,The sun counteracts cancer\n\nSunrays that hit...
4687,4688,997,MDSum_A_Sum_029_Text5.txt.txt,MDSum_A_Sum_029_Text5,sun_exposure,2.5,2.5,2.0,2.5,1.0,2.0,1.0,6.41,2.060,We are thus in a situation where people are re...,sun_exposure,Sun_exposure_Text5,"Sunbathing causes cancer \n\nBy Alex McFadden,..."
4688,4689,998,MDSum_A_Sum_031_Text3.txt.txt,MDSum_A_Sum_031_Text3,sun_exposure,2.5,3.0,3.5,2.0,2.5,2.5,3.0,8.28,3.925,"There are two types of cancers, melanoma and b...",sun_exposure,Sun_exposure_Text3,American Cancer Society\n\n \n\nUV radiation a...


In [17]:
summaries_df.to_csv(SUMM_FOLDER / 'final_summaries_ai_aloe_fixed.csv')

In [18]:
summaries_df = pd.read_csv('final_summaries_ai_aloe_fixed.csv', index_col=None)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Row.names,filename,filename_clean,source_text,Main.Point,Details,Cohesion,Objective.Language,Wording.Para,Lang..Bey..ST,Summ..Length,content_pca,paraphrase_pca,text,source_text_clean,source_text_filename_clean,source
0,0,1,1,1091_CivilServices .txt,1091_CivilServices,CivilService.txt,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.37,0.785,hard work pays off / / \n,CivilService,11_CivilService,\nCivil service offers jobs to thousands of me...
1,1,2,10,455436_Hybrids .txt,455436_Hybrids,Hybrids.txt,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.37,0.245,There will be a more petroleum product coming...,Hybrids,24_Hybrids,\nGasoline prices are getting higher and highe...
2,2,3,100,355927_SexDetermination .txt,355927_SexDetermination,Sex Determination.txt,1.0,2.0,3.0,1.5,0.5,1.0,1.0,5.60,1.030,Males have one Y chromosome and one X chromoso...,Sex Determination,Sex Determination,Mode of Inheritance: Sex Determination\nRecall...
3,3,4,1000,MDSum_A_Sum_037_Text1.txt.txt,MDSum_A_Sum_037_Text1,sun_exposure,2.5,2.0,1.5,2.0,2.5,1.5,1.0,5.42,3.435,Electromagnetic radiation with an energy and s...,sun_exposure,Sun_exposure_Text1,Text 1\n\n\n\nDifferent types of radiation\n\n...
4,4,5,1001,MDSum_A_Sum_037_Text2.txt.txt,MDSum_A_Sum_037_Text2,sun_exposure,2.5,2.0,2.0,2.5,3.0,2.0,1.0,5.86,4.220,It is highly recommended by doctors that peopl...,sun_exposure,Sun_exposure_Text2,Text 1\n\n\n\nDifferent types of radiation\n\n...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4685,4685,4686,995,MDSum_A_Sum_029_Text2.txt.txt,MDSum_A_Sum_029_Text2,sun_exposure,2.5,2.0,2.5,2.5,1.0,2.0,2.5,6.30,2.060,The results from many studies indicate that vi...,sun_exposure,Sun_exposure_Text2,Text 1\n\n\n\nDifferent types of radiation\n\n...
4686,4686,4687,996,MDSum_A_Sum_029_Text4.txt.txt,MDSum_A_Sum_029_Text4,sun_exposure,2.5,2.0,3.0,2.0,1.0,2.0,1.5,6.74,2.060,People are being advised to spend less time in...,sun_exposure,Sun_exposure_Text4,Text 1\n\n\n\nDifferent types of radiation\n\n...
4687,4687,4688,997,MDSum_A_Sum_029_Text5.txt.txt,MDSum_A_Sum_029_Text5,sun_exposure,2.5,2.5,2.0,2.5,1.0,2.0,1.0,6.41,2.060,We are thus in a situation where people are re...,sun_exposure,Sun_exposure_Text5,Text 1\n\n\n\nDifferent types of radiation\n\n...
4688,4688,4689,998,MDSum_A_Sum_031_Text3.txt.txt,MDSum_A_Sum_031_Text3,sun_exposure,2.5,3.0,3.5,2.0,2.5,2.5,3.0,8.28,3.925,"There are two types of cancers, melanoma and b...",sun_exposure,Sun_exposure_Text3,Text 1\n\n\n\nDifferent types of radiation\n\n...
