# Reading Data from Docx

In this jupyter notebook i will read the data from the docx file and convert it into a dataframe. Then we will store it into a csv.

## Importing Libraries

In [2]:
from docx import Document
import json
import os
import numpy as np
import pandas as pd

## Helper functions

In [3]:
def read_docx_json(file_path):
    doc = Document(file_path)
    full_text = "\n".join([para.text for para in doc.paragraphs])
    try:
        data = json.loads(full_text)
        return data
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None

In [4]:
def process_folder(folder_path):
    all_data = []
    for i,file_name in enumerate(os.listdir(folder_path)):
        if file_name.endswith(".docx"):
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}")
            data = read_docx_json(file_path)
            if data:
                if isinstance(data, list):
                    all_data.extend(data)
                else:
                    print(f"File {file_name} does not contain a list; skipping.")
    return all_data

In [5]:
def process_parsed_data(parsed_data):
    records = []
    for i in range(len(parsed_data)):
        type_ = parsed_data[i].get('type',np.nan)
        status_ = parsed_data[i].get('status',np.nan)    
        try:
            departure_dict = parsed_data[i]['departure']
            departure_iata_code = departure_dict.get('iataCode', np.nan)
            departure_icao_code = departure_dict.get('icaoCode', np.nan)
            departure_terminal = departure_dict.get('terminal', np.nan)
            departure_gate = departure_dict.get('gate', np.nan)
            departure_scheduled_time = departure_dict.get('scheduledTime', np.nan)
            departure_estimated_time = departure_dict.get('estimatedTime', np.nan)
            departure_actual_time = departure_dict.get('actualTime', np.nan)
            departure_estimated_runway = departure_dict.get('estimatedRunway', np.nan)
            departure_actual_runway = departure_dict.get('actualRunway', np.nan)
        except:
            departure_iata_code = np.nan
            departure_icao_code = np.nan
            departure_terminal = np.nan
            departure_gate = np.nan
            departure_scheduled_time = np.nan
            departure_estimated_time = np.nan
            departure_actual_time = np.nan
            departure_estimated_runway = np.nan
            departure_actual_runway = np.nan
        try:
            arrival_dict = parsed_data[i]['arrival']
            arrival_iata_code = arrival_dict.get('iataCode', np.nan)
            arrival_icao_code = arrival_dict.get('icaoCode', np.nan)
            arrival_terminal = arrival_dict.get('terminal', np.nan)
            arrival_scheduled_time = arrival_dict.get('scheduledTime', np.nan)
            arrival_estimated_time = arrival_dict.get('estimatedTime', np.nan)
            arrival_actual_time = arrival_dict.get('actualTime', np.nan)
            arrival_estimated_runway = arrival_dict.get('estimatedRunway', np.nan)
            arrival_actual_runway = arrival_dict.get('actualRunway', np.nan)
        except:
            arrival_iata_code = np.nan
            arrival_icao_code = np.nan
            arrival_terminal = np.nan
            arrival_scheduled_time = np.nan
            arrival_estimated_time = np.nan
            arrival_actual_time = np.nan
            arrival_estimated_runway = np.nan
            arrival_actual_runway = np.nan
        try:
            airline_dict = parsed_data[i]['airline']
            airline_name = airline_dict.get('name',np.nan)
            airline_iata_code = airline_dict.get('iataCode',np.nan)
            airline_icao_code = airline_dict.get('icaoCode',np.nan)
        except:
            airline_name = np.nan
            airline_iata_code = np.nan
            airline_icao_code = np.nan
        try:
            flight_dict = parsed_data[i]['flight']
            flight_number = flight_dict.get('number',np.nan)
            flight_iataNumber = flight_dict.get('iataNumber',np.nan)
            flight_icaoNumber = flight_dict.get('icaoNumber',np.nan)
        except:
            flight_number = np.nan
            flight_iataNumber = np.nan
            flight_icaoNumber = np.nan
        try:
            code_shared_dict = parsed_data[i]['codeshared']
            code_shared_flag = 1
            code_shared_airline = code_shared_dict.get('airline', {}).get('name', np.nan)
        except:
            code_shared_flag = 0
            code_shared_airline = np.nan
        record = {
            "type": type_,
            "status": status_,
            "departure_iata_code": departure_iata_code,
            "departure_icao_code": departure_icao_code,
            "departure_terminal": departure_terminal,
            "departure_gate": departure_gate,
            "departure_scheduled_time": departure_scheduled_time,
            "departure_estimated_time": departure_estimated_time,
            "departure_actual_time": departure_actual_time,
            "departure_estimated_runway": departure_estimated_runway,
            "departure_actual_runway": departure_actual_runway,
            "arrival_iata_code": arrival_iata_code,
            "arrival_icao_code": arrival_icao_code,
            "arrival_terminal": arrival_terminal,
            "arrival_scheduled_time": arrival_scheduled_time,
            "arrival_estimated_time": arrival_estimated_time,
            "arrival_actual_time": arrival_actual_time,
            "arrival_estimated_runway": arrival_estimated_runway,
            "arrival_actual_runway": arrival_actual_runway,
            "airline_name": airline_name,
            "airline_iata_code": airline_iata_code,
            "airline_icao_code": airline_icao_code,
            "flight_number": flight_number,
            "flight_iataNumber": flight_iataNumber,
            "flight_icaoNumber": flight_icaoNumber,
            "code_shared_flag": code_shared_flag,
            "code_shared_airline": code_shared_airline,
        }
        records.append(record)
    df = pd.DataFrame(records)
    return df

## Converting to a dataframe

In [6]:
folder_path = "Train"
parsed_data = process_folder(folder_path)

Processing file: 13.docx
Processing file: 18.docx
Processing file: 23.docx
Processing file: 33.docx
Processing file: 44.docx
Processing file: 42.docx
Processing file: 36.docx
Processing file: 63.docx
Processing file: 40.docx
Processing file: 20.docx
Processing file: 53.docx
Processing file: 10.docx
Processing file: 65.docx
Processing file: 51.docx
Processing file: 4.docx
Processing file: 9.docx
Processing file: 25.docx
Processing file: 17.docx
Processing file: 7.docx
Processing file: 62.docx
Processing file: 12.docx
Processing file: 8.docx
Processing file: 54.docx
Processing file: 55.docx
Processing file: 14.docx
Processing file: 2.docx
Processing file: 32.docx
Processing file: 49.docx
Processing file: 39.docx
Processing file: 67.docx
Processing file: 1.docx
Processing file: 45.docx
Processing file: 30.docx
Processing file: 46.docx
Processing file: 56.docx
Processing file: 26.docx
Processing file: 48.docx
Processing file: 47.docx
Processing file: 52.docx
Processing file: 19.docx
Proces

In [127]:
df = process_parsed_data(parsed_data=parsed_data)

In [128]:
df

Unnamed: 0,type,status,departure_iata_code,departure_icao_code,departure_terminal,departure_gate,departure_scheduled_time,departure_estimated_time,departure_actual_time,departure_estimated_runway,...,arrival_estimated_runway,arrival_actual_runway,airline_name,airline_iata_code,airline_icao_code,flight_number,flight_iataNumber,flight_icaoNumber,code_shared_flag,code_shared_airline
0,departure,active,lhe,opla,m,,2024-01-31t13:00:00.000,2024-01-31t13:00:00.000,2024-01-31t13:16:00.000,2024-01-31t13:16:00.000,...,,,airblue,pa,abq,410,pa410,abq410,0,
1,departure,active,lhe,opla,,,2024-01-25t15:05:00.000,2024-01-25t15:05:00.000,,,...,,,flyjinnah,9p,fjl,843,9p843,fjl843,0,
2,departure,active,lhe,opla,m,,2024-01-27t15:40:00.000,2024-01-27t15:46:00.000,2024-01-27t15:41:00.000,2024-01-27t15:41:00.000,...,,,flynas,xy,kne,318,xy318,kne318,0,
3,departure,active,lhe,opla,,,2024-01-19t15:05:00.000,2024-01-19t15:05:00.000,,,...,,,flyjinnah,9p,fjl,843,9p843,fjl843,0,
4,departure,active,lhe,opla,m,,2024-01-25t02:55:00.000,2024-01-25t02:55:00.000,2024-01-25t03:06:00.000,2024-01-25t03:06:00.000,...,,,american airlines,aa,aal,8284,aa8284,aal8284,1,qatar airways
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,departure,active,isb,opis,,,2024-01-25t03:10:00.000,2024-01-25t03:19:00.000,,,...,,,jetblue airways,b6,jbu,5557,b65557,jbu5557,1,qatar airways
51568,departure,active,isb,opis,,,2024-01-16t13:40:00.000,2024-01-16t19:25:00.000,,,...,,,serene air,er,sep,807,er807,sep807,0,
51569,departure,active,isb,opis,,,2024-01-18t22:15:00.000,2024-01-18t22:15:00.000,2024-01-18t22:30:00.000,2024-01-18t22:30:00.000,...,,,pakistan international airlines,pk,pia,161,pk161,pia161,0,
51570,departure,active,isb,opis,,,2024-01-22t13:00:00.000,2024-01-22t13:35:00.000,,,...,,,airblue,pa,abq,216,pa216,abq216,0,


## Storing as a csv

In [None]:
df.to_csv('output.csv',index=False)