In [382]:
import pandas as pd
import re
from collections import OrderedDict

In [383]:
# Load the raw export and rename the unnamed first CSV column to 'index'
# and the 'name' column to 'title'.
content = (
    pd.read_csv('content.csv', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'index', 'name': 'title'})
)
content.shape

(48, 4)

In [384]:
def extract_into_paragraphs(x):
    """Split a raw text blob into a list of non-empty paragraphs.

    Tabs and vertical tabs are treated as line breaks.  Empty pieces are
    dropped, then non-breaking spaces inside each remaining piece are replaced
    by regular spaces.  A piece made only of non-breaking spaces is therefore
    kept (as plain spaces), because the emptiness check runs before the
    replacement.

    Parameters
    ----------
    x : str
        Raw text of one document.

    Returns
    -------
    list of str
        The paragraphs in their original order.
    """
    paragraphs = []
    for piece in x.replace('\x0b', '\n').replace('\t', '\n').split('\n'):
        if piece == '':
            continue
        paragraphs.append(piece.replace('\xa0', ' '))
    return paragraphs

In [385]:
# One list of cleaned paragraphs per document.
content['content_clean'] = content['content'].map(extract_into_paragraphs)

In [386]:
# Regular expressions recognising a section-title line.  They are applied with
# re.match to the lower-cased paragraph, so each one is anchored at the start
# of the line.
# Accent alternatives are written as plain character classes ([ée]); a '|'
# inside brackets is a literal pipe, not an alternation.
section_titles = ['.{0,4}[ée]chauffement.{0,30}$', '.{0,4}exercices.{0,30}$', '.{0,4}matchs?.{0,30}$',
                  '.{0,4}sc[èe]nes.{0,10}$', '.{0,4}introduction.{0,10}$', '.{0,10}?th[eè]mes.{0,20}$',
                  '.{0,4}jeux.{0,50}$', '.{0,4}?autres.{0,10}$', '^théorie$', '^début des matchs$']

# Label attached to the paragraphs that follow each kind of title.
# 'remove' and '' are special: rows carrying either value are excluded from the
# final labelled export.
sections_titles_clean = {
    section_titles[0]: 'Echauffement',
    section_titles[1]: 'Exercices',
    section_titles[2]: 'Matches',
    section_titles[3]: 'Matches',
    section_titles[4]: 'remove',
    section_titles[5]: 'Thèmes',
    section_titles[6]: 'Exercices',
    section_titles[7]: 'Matches',
    section_titles[8]: '',
    section_titles[9]: 'Matches',
}

In [387]:
# Number of paragraphs in each document, in row order.
content_lengths = [len(paragraphs) for paragraphs in content['content_clean']]
# Offset of each document's first paragraph in the flattened (one row per
# paragraph) numbering: the total paragraph count of all preceding rows.
# Computed as an exclusive cumulative sum over row positions, so it runs in a
# single pass and does not rely on the 'index' column holding 0..n-1.
content['new_index'] = (
    pd.Series(content_lengths, index=content.index, dtype='int64')
    .cumsum()
    .shift(1, fill_value=0)
)

In [388]:
def build_section_indexes(x):
    """Locate the section-title paragraphs of one document.

    Every paragraph in ``x['content_clean']`` is lower-cased and tested with
    ``re.match`` against each pattern of ``section_titles``.  When several
    patterns match the same paragraph, the label of the last matching pattern
    is the one that is kept.

    Parameters
    ----------
    x : row of ``content``
        Must provide ``'content_clean'`` (list of paragraphs) and
        ``'new_index'`` (offset of the document's first paragraph).

    Returns
    -------
    OrderedDict
        Maps ``new_index + position`` of each title paragraph to its label
        from ``sections_titles_clean``.
    """
    offset = x['new_index']
    sections = OrderedDict()
    for position, paragraph in enumerate(x['content_clean']):
        lowered = paragraph.lower()
        matching = [pattern for pattern in section_titles if re.match(pattern, lowered)]
        for pattern in matching:
            sections[offset + position] = sections_titles_clean[pattern]
    return sections

# Debugging aid: inspect the sections detected for a single document.
#build_section_indexes(content.iloc[2])
# For every document, store the mapping {new_index + paragraph position: section label}.
content['sections'] = content.apply(build_section_indexes, axis=1)    

In [389]:
# One row per paragraph.  DataFrame.explode turns an empty list into a single
# NaN row, which would shift every following row by one relative to the
# 'new_index' offsets (those count zero paragraphs for such a document), so
# documents without any paragraph are left out before exploding.
has_paragraphs = content['content_clean'].map(len) > 0
content_long = content[has_paragraphs].explode('content_clean')

In [390]:
# Renumber the rows 0..n-1 so each row label is the paragraph's position in
# the flattened frame; the previous row labels are kept as a column.
content_long = content_long.reset_index()

In [391]:
# Sanity check: (rows, columns) of the one-row-per-paragraph frame.
content_long.shape

(5069, 8)

In [392]:
def assign_categories(x):
    """Return the row's own label if the row is a section title, else None.

    A row is a section title when its label (``x.name``) is one of the keys of
    the ``x['sections']`` mapping.
    """
    if x.name in x['sections']:
        return x.name
    return None
# Mark section-title rows with their own row label; every other row gets a missing value.
content_long = content_long.assign(categories=content_long.apply(assign_categories, axis=1))

In [393]:
# Drop paragraphs that consist of exactly one space.  The explicit copy makes
# the filtered frame independent of the original, so the column assignments
# performed on it afterwards are unambiguous (no SettingWithCopyWarning).
content_long = content_long[content_long['content_clean'] != ' '].copy()
content_long.shape

(5063, 9)

In [394]:
# Propagate, within each document, the row label of the most recent section
# title down to the paragraphs that follow it.  GroupBy.ffill keeps the
# original row index, so the result aligns with content_long on assignment.
section_start = content_long.groupby('index')['categories'].ffill()
# Rows located before the first section title of their document have no start.
in_section = section_start.notna()
# Keep the historical convention of storing 0 for "no section" ...
content_long['categories_index'] = section_start.fillna(0)
# ... but decide from the mask rather than from "> 0", so that a section whose
# title sits at row label 0 is still resolved to its label.
content_long['categories'] = [
    sections[start] if found else ''
    for sections, start, found in zip(
        content_long['sections'], content_long['categories_index'], in_section
    )
]

In [395]:
# Flag the section-title rows themselves so they can be filtered out later
def remove_categories_titles(x):
    """Return 'remove' for a section-title row, otherwise the row's category.

    A row is a section title when its label (``x.name``) is a key of
    ``x['sections']``.
    """
    is_title_row = x.name in x['sections']
    return 'remove' if is_title_row else x['categories']
# Replace the category of every section-title row by the 'remove' marker.
content_long = content_long.assign(categories=content_long.apply(remove_categories_titles, axis=1))

In [396]:
# Patterns for lines that carry match metadata rather than content.  They are
# applied with re.match to the lower-cased paragraph.  The position of the
# theme pattern (index 2) matters: remove_extra_matches compares against
# match_other[2] to relabel those lines instead of discarding them.
# Accent alternatives are plain character classes ([eè]); a '|' inside
# brackets would be matched as a literal pipe.
match_other = ['^type.?:', '^.{0,10}joueurs?.{0,40}$', '^th[eè]me', '^dur[ée]e', '^nature.{0,50}$', '^temps', '^caucus.{0,40}$']
def remove_extra_matches(x, item):
    """Re-label a row whose paragraph matches the metadata pattern ``item``.

    Parameters
    ----------
    x : row of ``content_long``
        Must provide ``'content_clean'`` (paragraph text) and ``'categories'``.
    item : str
        Regular expression, tested with ``re.match`` on the lower-cased text.

    Returns
    -------
    str
        ``'Thèmes'`` when the paragraph matches and ``item`` equals
        ``match_other[2]``, ``'remove'`` for a match on any other pattern,
        and the row's current category when there is no match.
    """
    if not re.match(item, x['content_clean'].lower()):
        return x['categories']
    return 'Thèmes' if item == match_other[2] else 'remove'
# Apply the metadata patterns one after another; a later pattern overrides the
# label set by an earlier one when both match the same paragraph.
for pattern in match_other:
    content_long['categories'] = content_long.apply(remove_extra_matches, axis=1, item=pattern)

In [397]:
# Split the paragraphs into the labelled set and the unlabelled remainder
# (theory / text preceding the first section), which is exported separately.
content_final = content_long[(content_long.categories != '') & (content_long.categories != 'remove')]
# Unlabelled paragraphs carry the empty string at this point (the category
# column is filled with strings by the previous steps), so testing only for
# missing values would always give an empty frame.  Missing values are still
# accepted for backward compatibility.
theory = content_long[content_long.categories.isna() | (content_long.categories == '')]

In [398]:
# Keep only the first occurrence of each distinct paragraph text.
# NOTE(review): content_final and theory were derived from content_long before
# this step, so the deduplication does not affect the files exported from
# them -- confirm whether that is intended.
content_long = content_long.drop_duplicates(subset='content_clean')
content_long.shape

(3146, 10)

In [399]:
# Write the labelled paragraphs and the unlabelled remainder to disk as UTF-8 CSV files.
content_final.loc[:, ['title', 'content_clean', 'categories']].to_csv(
    'content_long.csv', index=False, encoding='utf-8'
)
theory.loc[:, ['title', 'content_clean']].to_csv(
    'theory.csv', index=False, encoding='utf-8'
)

In [400]:
# Number of non-missing values per column for each category of the labelled paragraphs.
content_final.groupby('categories').count()

Unnamed: 0_level_0,level_0,index,title,content,id,content_clean,new_index,sections,categories_index
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Echauffement,748,748,748,748,748,748,748,748,748
Exercices,1113,1113,1113,1113,1113,1113,1113,1113,1113
Matches,1113,1113,1113,1113,1113,1113,1113,1113,1113
Thèmes,483,483,483,483,483,483,483,483,483
