In [1]:
import itertools
import re
import pandas as pd
import numpy as np
import os

# Defining functions to get w-2, w-1, w+1, w+2, w-2%w-1, w-1%w+1, w+1%w+2, and the class.

Special characters are considered separate words.

In [2]:
def clean_data(df):
    """Return a cleaned copy of a context-feature frame.

    The frame passed in is not modified; all cleaning happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``instance_id`` column of strings plus any number of
        further columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * surrounding double quotes are stripped from ``instance_id``,
        * the markers ``<head>`` and ``</head>`` are removed from every
          string value,
        * every comma inside a string value is replaced by the letter ``C``,
        * missing values are replaced by the empty string.
    """
    # Work on a copy so the column assignment below cannot leak back into
    # the caller's frame.
    cleaned = df.copy()

    # Strip the quotes around the instance id.
    cleaned['instance_id'] = cleaned['instance_id'].str.strip('"')

    # Turn values stored as <head>value</head> into plain "value".
    cleaned = cleaned.replace('<head>', '', regex=True)
    cleaned = cleaned.replace('</head>', '', regex=True)

    # Replace "," by "C" -- presumably so a comma token cannot collide with
    # the separator of the comma-delimited final files (TODO confirm).
    cleaned = cleaned.replace(',', 'C', regex=True)

    # Replace NaN, if any, with the empty string.
    return cleaned.fillna("")

In [3]:
def get_matching_string(infile, tempfile, outfile):
    """Build the two-word context window around every ``<head>`` token.

    Processing steps:

    1. ``infile`` is read and every run of whitespace (line breaks included)
       is collapsed to a single space; the result is written to ``tempfile``.
    2. The text is split on the literal ``<instance id=``. Within each chunk
       the first double-quoted run of non-space characters is taken as the
       instance id (empty string when there is none).
    3. For every ``<head>word</head>`` occurrence in a chunk, the matched
       text (up to two tokens before and after the head) is written to
       ``outfile`` as ``<id> <match>`` on its own line, and the neighbouring
       tokens are collected.
    4. The collected rows are cleaned with ``clean_data`` and the pair
       features are added.

    Parameters
    ----------
    infile : str
        Path of the text to scan.
    tempfile : str
        Path that receives the whitespace-normalised text.
    outfile : str
        Path that receives one line per head occurrence.

    Returns
    -------
    pandas.DataFrame
        Columns ``instance_id``, ``w-2``, ``w-1``, ``w+1``, ``w+2``,
        ``w-2%w-1``, ``w-1%w+1`` and ``w+1%w+2``. A neighbour that does not
        exist is the empty string.
    """
    # Collapse all whitespace to single spaces. Paragraph information is
    # lost; instances are recovered below by splitting on their opening tag.
    # Joining with a space (rather than concatenating the lines) keeps the
    # last token of one line from fusing with the first token of the next.
    with open(infile, "rt") as fin:
        normalized = ' '.join(fin.read().split())

    with open(tempfile, "wt") as ftemp:
        ftemp.write(normalized)

    # Raw strings avoid invalid-escape warnings for \S, \s and \w.
    id_pattern = re.compile(r'"\S+"')
    context_pattern = re.compile(
        r'(?P<before>(?:\S+\s){0,2})'
        r'(?:<head>\w+</head>)+'
        r'(?P<after>(?:\s\S+){0,2})'
    )

    # Rows are assembled from the match groups instead of re-parsing
    # ``outfile`` as a space-delimited CSV: a head with fewer than two
    # neighbours on one side would otherwise shift every later column, and
    # tokens containing a double quote would be interpreted as CSV quoting.
    # NOTE(review): markup tokens that sit next to the head are not filtered
    # and can show up as context words -- confirm whether that is intended.
    rows = []
    with open(outfile, 'w') as fout:
        for chunk in normalized.split("<instance id="):
            # Each chunk starts with the instance id because the text was
            # split on the id attribute.
            id_match = id_pattern.search(chunk)
            instance_id = id_match.group(0) if id_match else ""

            for hit in context_pattern.finditer(chunk):
                fout.write(instance_id + " " + hit.group(0) + '\n')

                before = hit.group('before').split()
                after = hit.group('after').split()
                rows.append(
                    [instance_id]
                    + [''] * (2 - len(before)) + before   # pad on the far left
                    + after + [''] * (2 - len(after))     # pad on the far right
                )

    df = pd.DataFrame(rows, columns=['instance_id', 'w-2', 'w-1', 'w+1', 'w+2'])

    df = clean_data(df)

    # Pair features: w-2%w-1, w-1%w+1, w+1%w+2.
    df['w-2%w-1'] = df['w-2'] + '%' + df['w-1']
    df['w-1%w+1'] = df['w-1'] + '%' + df['w+1']
    df['w+1%w+2'] = df['w+1'] + '%' + df['w+2']

    return df

In [4]:
def get_key(keyfile, key):
    """Load a key file and keep the rows that belong to one name.

    Parameters
    ----------
    keyfile : str
        Path to a space-delimited file whose fields are read as ``name``,
        ``instance_id``, ``senseid`` and an optional ``senseid2``.
    key : str
        Value of the ``name`` field to keep, e.g. ``'arm.n'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``instance_id`` and ``senseid`` for the matching
        rows, with the row index of the full key file preserved.
    """
    # Read all keys from the file.
    key_table = pd.read_csv(
        keyfile,
        names=['name', 'instance_id', 'senseid', 'senseid2'],
        delimiter=' ',
    )

    # Keep only the ids that belong to the requested name.
    selected = key_table.loc[key_table['name'] == key]

    # Only the first sense id is used; rows that carry a second sense are not
    # duplicated. ``columns=`` replaces the positional axis argument, which
    # pandas >= 2.0 no longer accepts.
    return selected.drop(columns='senseid2')

# Defining a function to extract all the examples of the given words. 

Code to extract information between two word instances was taken from: "https://stackoverflow.com/questions/18865058/extract-values-between-two-strings-in-a-text-file-using-python"

In [5]:
def extract_content(infile, outfile, start, end, featurefile, tempfile, keyfile, key):
    """Extract one delimited section of ``infile`` and build its labelled features.

    Parameters
    ----------
    infile : str
        Path of the file to read.
    outfile : str
        Path that receives the lines found between ``start`` and ``end``.
    start, end : str
        Lines (compared after stripping surrounding whitespace) that open
        and close a section. The marker lines themselves are not copied.
    featurefile : str
        Passed to ``get_matching_string`` as its output file.
    tempfile : str
        Passed to ``get_matching_string`` as its temporary file.
    keyfile : str
        Path of the key file read by ``get_key``.
    key : str
        Name whose key rows are selected by ``get_key``.

    Returns
    -------
    pandas.DataFrame
        The feature columns joined with ``senseid``; the ``name`` and
        ``instance_id`` columns are dropped after the join.
    """
    # Copy every start..end section into its own file.
    with open(infile, 'r') as f, open(outfile, 'w') as fout:
        while True:
            # Skip forward to the next start marker; dropwhile leaves the
            # marker as the first item, which next() then consumes.
            it = itertools.dropwhile(lambda line: line.strip() != start, f)
            if next(it, None) is None:
                break  # file exhausted, no further section
            # Copy lines up to the end marker; takewhile consumes the marker.
            fout.writelines(itertools.takewhile(lambda line: line.strip() != end, it))

    # Get the features and the keys.
    features = get_matching_string(outfile, tempfile, featurefile)
    keys = get_key(keyfile, key)

    # Inner join on the instance id: rows present on only one side are dropped.
    labelled = pd.merge(features, keys, on='instance_id')

    # Drop the columns that are no longer needed. ``columns=`` replaces the
    # positional axis argument, which pandas >= 2.0 no longer accepts.
    return labelled.drop(columns=['name', 'instance_id'])

# Defining all the filenames

In [6]:
# Input corpora and their key files (1 = training, 2 = test).
infile1, keyfile1 = 'EnglishLS.train/EnglishLS.train', 'EnglishLS.train/EnglishLS.train.key'
infile2, keyfile2 = 'EnglishLS.test/EnglishLS.test', 'EnglishLS.test/EnglishLS.test.key'

# Names to extract, with the marker lines that open and close each section.
key1, key2, key3 = 'arm.n', 'difficulty.n', 'interest.n'
start1, start2, start3 = (
    '<lexelt item="arm.n">',
    '<lexelt item="difficulty.n">',
    '<lexelt item="interest.n">',
)
end = '</lexelt>'

# In every group below, 1-3 belong to the training data and 4-6 to the test data.

# Raw section extracts.
outfile1, outfile2, outfile3 = 'arms_train.txt', 'difficulty_train.txt', 'interest_train.txt'
outfile4, outfile5, outfile6 = 'arms_test.txt', 'difficulty_test.txt', 'interest_test.txt'

# Whitespace-normalised temporary copies.
tempfile1, tempfile2, tempfile3 = (
    'arms_train_features_temp.txt',
    'difficulty_train_features_temp.txt',
    'interest_train_features_temp.txt',
)
tempfile4, tempfile5, tempfile6 = (
    'arms_test_features_temp.txt',
    'difficulty_test_features_temp.txt',
    'interest_test_features_temp.txt',
)

# Intermediate feature files.
featurefile1, featurefile2, featurefile3 = (
    'arms_train_features.txt',
    'difficulty_train_features.txt',
    'interest_train_features.txt',
)
featurefile4, featurefile5, featurefile6 = (
    'arms_test_features.txt',
    'difficulty_test_features.txt',
    'interest_test_features.txt',
)

# Final outputs.
finalfile1, finalfile2, finalfile3 = (
    'arms_train_final.txt',
    'difficulty_train_final.txt',
    'interest_train_final.txt',
)
finalfile4, finalfile5, finalfile6 = (
    'arms_test_final.txt',
    'difficulty_test_final.txt',
    'interest_test_final.txt',
)

# Calling the functions on all 3 training and testing data.

In [7]:
# Training data: one labelled feature frame per name.
df1 = extract_content(
    infile=infile1, outfile=outfile1, start=start1, end=end,
    featurefile=featurefile1, tempfile=tempfile1, keyfile=keyfile1, key=key1,
)
df2 = extract_content(
    infile=infile1, outfile=outfile2, start=start2, end=end,
    featurefile=featurefile2, tempfile=tempfile2, keyfile=keyfile1, key=key2,
)
df3 = extract_content(
    infile=infile1, outfile=outfile3, start=start3, end=end,
    featurefile=featurefile3, tempfile=tempfile3, keyfile=keyfile1, key=key3,
)

In [8]:
# Test data: one labelled feature frame per name.
df4 = extract_content(
    infile=infile2, outfile=outfile4, start=start1, end=end,
    featurefile=featurefile4, tempfile=tempfile4, keyfile=keyfile2, key=key1,
)
df5 = extract_content(
    infile=infile2, outfile=outfile5, start=start2, end=end,
    featurefile=featurefile5, tempfile=tempfile5, keyfile=keyfile2, key=key2,
)
df6 = extract_content(
    infile=infile2, outfile=outfile6, start=start3, end=end,
    featurefile=featurefile6, tempfile=tempfile6, keyfile=keyfile2, key=key3,
)

In [9]:
# Training features and sense ids for arm.n.
df1

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,with,their,chained,to,with%their,their%chained,chained%to,arm%1:08:00::
1,under,his,and,come,under%his,his%and,and%come,arm%1:08:00::
2,raised,his,involuntarily,C,raised%his,his%involuntarily,involuntarily%C,arm%1:08:00::
3,under,your,and,wishing,under%your,your%and,and%wishing,arm%1:08:00::
4,searching,for,.,This,searching%for,for%.,.%This,arm%1:06:01::
...,...,...,...,...,...,...,...,...
269,losing,one,or,one,losing%one,one%or,or%one,arm%1:08:00::
270,lost,an,or,leg,lost%an,an%or,or%leg,arm%1:08:00::
271,C,his,by,his,C%his,his%by,by%his,arm%1:08:00::
272,pulled,my,out,there,pulled%my,my%out,out%there,arm%1:08:00::


In [10]:
# Training features and sense ids for difficulty.n.
df2

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,of,the,is,that,of%the,the%is,is%that,difficulty%1:26:00::
1,I,have,in,continuing,I%have,have%in,in%continuing,difficulty%1:04:00::
2,can,produce,in,the,can%produce,produce%in,in%the,difficulty%1:09:02::
3,although,the,include,the,although%the,the%include,include%the,difficulty%1:26:00::
4,in,serious,about,knowing,in%serious,serious%about,about%knowing,difficulty%1:09:02::
5,may,have,in,deciding,may%have,have%in,in%deciding,difficulty%1:04:00::
6,most,obvious,is,that,most%obvious,obvious%is,is%that,difficulty%1:07:00::
7,performance,pose,for,the,performance%pose,pose%for,for%the,difficulty%1:09:02::
8,C,the,are,complicated,C%the,the%are,are%complicated,difficulty%1:09:02::
9,note,about,of,attribution,note%about,about%of,of%attribution,difficulty%1:09:02::


In [11]:
# Training features and sense ids for interest.n.
df3

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,Serving,the,of,Haemophiliacs,Serving%the,the%of,of%Haemophiliacs,interest%1:07:01::
1,and,the,of,clients,and%the,the%of,of%clients,interest%1:07:01::
2,and,shared,C,not,and%shared,shared%C,C%not,interest%1:04:01::
3,building,society,on,which,building%society,society%on,on%which,interest%1:21:00::
4,building,society,on,which,building%society,society%on,on%which,interest%1:21:00::
...,...,...,...,...,...,...,...,...
183,shown,increasing,in,it,shown%increasing,increasing%in,in%it,interest%1:09:00::
184,one,'s,to,have,one%'s,'s%to,to%have,interest%1:07:01::
185,the,public,.,The,the%public,public%.,.%The,interest%1:07:01::
186,only,of,to,three,only%of,of%to,to%three,interest%1:07:02::


In [12]:
# Test features and sense ids for arm.n.
df4

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,with,six,as,Siva,with%six,six%as,as%Siva,arm%1:08:00::
1,all,his,C,And,all%his,his%C,C%And,arm%1:08:00::
2,of,folded,C,The,of%folded,folded%C,C%The,arm%1:08:00::
3,above,the,of,the,above%the,the%of,of%the,arm%1:06:03::
4,the,sparging,at,the,the%sparging,sparging%at,at%the,U
...,...,...,...,...,...,...,...,...
128,side,-,C,sometimes,side%-,-%C,C%sometimes,arm%1:06:01::
129,waving,his,about,and,waving%his,his%about,about%and,arm%1:08:00::
130,drawing,his,in,a,drawing%his,his%in,in%a,arm%1:08:00::
131,washed,my,in,the,washed%my,my%in,in%the,arm%1:08:00::


In [13]:
# Test features and sense ids for difficulty.n.
df5

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,produce,great,in,the,produce%great,great%in,in%the,difficulty%1:04:00::
1,encounter,many,with,our,encounter%many,many%with,with%our,difficulty%1:09:02::
2,of,her,is,to,of%her,her%is,is%to,difficulty%1:07:00::
3,find,more,in,following,find%more,more%in,in%following,difficulty%1:04:00::
4,seldom,without,),when,seldom%without,without%),)%when,difficulty%1:26:00::
5,:,his,is,assessing,:%his,his%is,is%assessing,difficulty%1:09:02::
6,The,first,is,C,The%first,first%is,is%C,difficulty%1:09:02::
7,.,One,in,describing,.%One,One%in,in%describing,difficulty%1:09:02::
8,constantly,presented,to,critics,constantly%presented,presented%to,to%critics,difficulty%1:09:02::
9,some,tormenting,C,His,some%tormenting,tormenting%C,C%His,difficulty%1:26:00::


In [14]:
# Test features and sense ids for interest.n.
df6

Unnamed: 0,w-2,w-1,w+1,w+2,w-2%w-1,w-1%w+1,w+1%w+2,senseid
0,C,mortgage,relief,C,C%mortgage,mortgage%relief,relief%C,interest%1:21:00::
1,C,economic,and,religious,C%economic,economic%and,and%religious,interest%1:07:01::
2,.,Her,has,been,.%Her,Her%has,has%been,interest%1:04:01::
3,and,persistent,which,human,and%persistent,persistent%which,which%human,interest%1:09:00::
4,indicated,an,in,the,indicated%an,an%in,in%the,interest%1:09:00::
...,...,...,...,...,...,...,...,...
88,the,public,for,challenges,the%public,public%for,for%challenges,interest%1:07:01::
89,serve,the,of,the,serve%the,the%of,of%the,interest%1:07:01::
90,a,financial,in,United,a%financial,financial%in,in%United,interest%1:21:03::
91,a,financial,in,another,a%financial,financial%in,in%another,interest%1:21:03::


# Saving final training and testing files.

In [15]:
# Write the arm.n training frame as comma-separated rows, no header, no index.
df1.to_csv(finalfile1, sep=',', header=False, index=False)

In [16]:
# Write the difficulty.n training frame as comma-separated rows, no header, no index.
df2.to_csv(finalfile2, sep=',', header=False, index=False)

In [17]:
# Write the interest.n training frame as comma-separated rows, no header, no index.
df3.to_csv(finalfile3, sep=',', header=False, index=False)

In [18]:
# Write the arm.n test frame as comma-separated rows, no header, no index.
df4.to_csv(finalfile4, sep=',', header=False, index=False)

In [19]:
# Write the difficulty.n test frame as comma-separated rows, no header, no index.
df5.to_csv(finalfile5, sep=',', header=False, index=False)

In [20]:
# Write the interest.n test frame as comma-separated rows, no header, no index.
df6.to_csv(finalfile6, sep=',', header=False, index=False)

# Removing the temporary files created.

In [21]:
# Delete the intermediate files: the raw section extracts first, then the
# feature files, then the whitespace-normalised temporary copies.
for leftover in (
    outfile1, outfile2, outfile3,
    outfile4, outfile5, outfile6,
    featurefile1, featurefile2, featurefile3,
    featurefile4, featurefile5, featurefile6,
    tempfile1, tempfile2, tempfile3,
    tempfile4, tempfile5, tempfile6,
):
    os.remove(leftover)