# Document importing and cleaning

In [1]:
# importing required modules
import nltk
from nltk.corpus import stopwords
import glob
import re
import os
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import docx2txt
import textract
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string
from docx.api import Document
import time
import torch

#pandas max columns and rows
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", None)

In [2]:
start_time = time.time()

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU instead.")

No GPU available, using CPU instead.


In [4]:
# Find the current working directory
OCD_df=  pd.read_csv("C:/Users/me1awq/PhD/docsim/datasets/rssb/OCD rulebook.csv", header=None)
OCD_df.columns = ["Source_rules_document"]
OCD_df

Unnamed: 0,Source_rules_document
0,2.1 Separation of trains
1,Principle: The method of signalling must maintain a space interval between trains that is safe.
2,2.1.1 Context
3,"2.1.1.1 The aim of this principle is to reduce the risk of collision between trains, by establishing a spatial envelope, or interval, around each train. This is normally achieved and maintained by train control and signalling systems, based on one train in one section of line at a time; it applies to train movements on single, double or multiple-track lines."
4,"2.1.1.2 Train control and signalling systems are designed to make the space interval effective against collision with train movements from converging lines, as well as following and opposing movements of trains on the same line."
5,"2.1.1.3 The principle encompasses situations in which, for operational reasons, the space interval between two trains is reduced or eliminated, either as part of regular planned operation (permissive working) or when it is necessary to use a train to assist another train which has failed. It does not include movements solely for the purpose of coupling or uncoupling."
6,2.1.1.4 Operating procedures using written or spoken instructions are applied when the signalling system cannot be used to preserve the effectiveness of the space interval:
7,a) for movements in the opposite direction to that for which the line is signalled
8,"b) when the functionality of the system is partially or totally degraded, due to failure or planned disconnection."
9,2.1.1.5 Authority for train movements under normal and degraded conditions is discussed in section 2.2 of this operational concept document.


In [5]:
# Find the current working directory
EA_df=  pd.read_csv("C:/Users/me1awq/PhD/docsim/datasets/rssb/EA rulebook.csv", header=None, encoding='cp1252')
EA_df.columns = ["Target_rules_document"]
EA_df

Unnamed: 0,Target_rules_document
0,Driver
1,Driver
2,Driver
3,IM
4,IM
5,IM
6,IM
7,Maintenance
8,Maintenance
9,Maintenance


In [6]:
# main_df= OCD_df.append(EA_df, ignore_index=True)
# main_df.columns=['Full_document_rules']
# main_df

In [7]:
# # create a new column by merging columns A and B
# main_df['Full_document_rules'] = OCD_df['Source_rules_document'] + EA_df['Target_rules_document']
# print(main_df)

In [14]:
main_df = pd.concat([OCD_df, EA_df])
# create a new data frame with one column
# df_new = main_df.melt(value_vars=main_df.columns)
# main_df.columns=['Full_document_rules']
# result = result.stack().reset_index(drop=True)
# print the merged dataframe
main_df

Unnamed: 0,Source_rules_document,Target_rules_document
0,2.1 Separation of trains,
1,Principle: The method of signalling must maintain a space interval between trains that is safe.,
2,2.1.1 Context,
3,"2.1.1.1 The aim of this principle is to reduce the risk of collision between trains, by establishing a spatial envelope, or interval, around each train. This is normally achieved and maintained by train control and signalling systems, based on one train in one section of line at a time; it applies to train movements on single, double or multiple-track lines.",
4,"2.1.1.2 Train control and signalling systems are designed to make the space interval effective against collision with train movements from converging lines, as well as following and opposing movements of trains on the same line.",
5,"2.1.1.3 The principle encompasses situations in which, for operational reasons, the space interval between two trains is reduced or eliminated, either as part of regular planned operation (permissive working) or when it is necessary to use a train to assist another train which has failed. It does not include movements solely for the purpose of coupling or uncoupling.",
6,2.1.1.4 Operating procedures using written or spoken instructions are applied when the signalling system cannot be used to preserve the effectiveness of the space interval:,
7,a) for movements in the opposite direction to that for which the line is signalled,
8,"b) when the functionality of the system is partially or totally degraded, due to failure or planned disconnection.",
9,2.1.1.5 Authority for train movements under normal and degraded conditions is discussed in section 2.2 of this operational concept document.,


In [None]:
# Import stopwords with nltk.
from nltk.corpus import stopwords
stop = stopwords.words('english')

custom_stopwords = ["â€“","â€™","a)","–","–","b)","c)","d)","e)",":","(",")","â€˜","-"]
# punctation=[":","(",")"]
stop.extend(custom_stopwords)
# stop.extend(punctation)

In [None]:
main_df['Stop_words_removal_documents_rules']= main_df['Full_document_rules'].str.lower()
main_df

In [None]:
main_df['Clean_documents_rules'] = main_df['Stop_words_removal_documents_rules'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
main_df

In [None]:
main_df['Word_tokenize_rules'] = main_df['Clean_documents_rules'].apply(word_tokenize)
main_df

# Wordcloud

In [None]:
#world cloud
allwords= ''.join( [rpt for rpt in main_df['Clean_documents_rules']] )
wordCloud = WordCloud(background_color="black",width =1600, height=900, random_state =16, 
                      max_font_size= 150, min_word_length=3,max_words=150,stopwords=stop).generate(allwords)

plt.figure(figsize = (20,15), dpi=800)
plt.imshow(wordCloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
end_time = time.time()

# calculate the total time taken to execute the code on the CPU
total_time = end_time - start_time

print(f"Total time taken to execute the code on CPU: {total_time:.6f} seconds")

In [15]:
import pandas as pd

# create two single column dataframes
df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'B': [4, 5, 6]})

# concatenate the dataframes along the column axis (axis=1)
merged_df = pd.concat([df1, df2], axis=1)

# rename the resulting column to a single name
merged_df.columns = ['C']

# drop any null values that may have been created during the merge
merged_df = merged_df.dropna()

# display the resulting dataframe
print(merged_df)

ValueError: Length mismatch: Expected axis has 2 elements, new values have 1 elements