<a href="https://colab.research.google.com/github/alirezashirmarz/XR-AR_NTC/blob/main/4_Read%26Extract_Features_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Title: Extract Features IPI, IFI, FS from CSV File (Paper Dataset)**
#### e.g. [36] Andreas Traßl, Nick Schwarzenberg, Philipp Schulz (2021). Augmented Reality Streams for Cloud-Based Rendering. IEEE Dataport. https://dx.doi.org/10.21227/jjan-tj96

##### It can be used for any packet list exported from Wireshark as a CSV file

# 1- **Set the root directory from which the features are extracted** (It must be done before Cell 1)

In [None]:
# Directory that the later cells search (including sub-directories) for the
# input CSV files; edit this path before running the cells below.
root_directory = r'/home/alireza/Downloads/DS/Others/CloudGaming/MyCSV'

# Echo the configured path as a quick sanity check.
print(root_directory)

#print(files_with_extension[0].split('.')[-5].split(r"""\""")[-1])

/home/alireza/Downloads/DS/Others/CloudGaming/MyCSV


In [None]:
# Find all files with a given extension under a directory tree
import os
import glob

def find_files_with_extension(root_dir, extension):
    """Recursively collect the files under *root_dir* whose name ends with *extension*.

    Args:
        root_dir: Directory to walk; every sub-directory is visited as well.
        extension: Suffix compared with ``str.endswith`` (e.g. ``'.csv'``).
            The comparison is case-sensitive, and a value without the
            leading dot (``'csv'``) also matches names such as ``'notcsv'``.

    Returns:
        Sorted list of full paths (the containing directory joined with the
        file name). The list is empty when nothing matches.
    """
    # os.walk yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the result (and anything numbered from it) reproducible across runs.
    return sorted(
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(root_dir)
        for file_name in file_names
        if file_name.endswith(extension)
    )

# Suffix of the files to look for below the root directory
# ('.pcap' or '.pcapng' can be used here instead).
extension = '.csv'
files_with_extension = find_files_with_extension(
    root_dir=root_directory,
    extension=extension,
)

# Report every match, one path per line.
print("Files found:")
for file_path in files_with_extension:
    print(file_path)


# 2- **Extract the features from the files**

In [None]:
import pandas as pd
import re
from ipaddress import ip_address

def extract_info(info):
    """Extract SrcPort, DstPort and packet payload size from an ``Info`` field.

    The text is searched for the first occurrence of
    ``<digits> > <digits> Len=<digits>`` (whitespace around ``>`` and before
    ``Len=`` is required).

    Args:
        info: Value of the ``Info`` column. Anything that is not a ``str``
            (for example the NaN pandas produces for an empty cell) is
            treated as "no match" instead of raising ``TypeError``.

    Returns:
        ``(src_port, dst_port, payload_len)`` as ints, or
        ``(None, None, None)`` when the pattern is not found.
    """
    if not isinstance(info, str):
        # re.search only accepts text; empty CSV cells arrive as float NaN.
        return None, None, None

    match = re.search(r'(\d+)\s+>\s+(\d+)\s+Len=(\d+)', info)
    if match is None:
        return None, None, None

    src_port, dst_port, payload_len = map(int, match.groups())
    return src_port, dst_port, payload_len

# Function to load and process the CSV file
def process_packet_data(file_path):
    """Load a packet-list CSV and derive per-packet, per-flow and per-frame features.

    The CSV must provide the columns ``No.``, ``Time``, ``Source``,
    ``Destination``, ``Protocol``, ``Length`` and ``Info``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        DataFrame with one row per input row and these columns:

        * ``ID``, ``SrcIP``, ``DstIP``, ``Protocol`` - the input columns
          ``No.``, ``Source``, ``Destination``, ``Protocol``.
        * ``SrcPort``, ``DstPort``, ``PS`` - the three numbers parsed from
          ``Info`` (``<src> > <dst> Len=<n>``); 0 where ``Info`` does not match.
        * ``IPI`` - difference of ``Time`` to the previous row (0 for the first).
        * ``FlowSizeBytes`` / ``FlowSizePackets`` - sum of ``Length`` / row count
          over all rows sharing (Source, Destination, Protocol).
        * ``FS`` / ``FS(PKT)`` - sum of ``Length`` / row count over all rows
          sharing (Source, Destination, SrcPort, DstPort, Protocol).
        * ``NumFrames`` - number of distinct 5-tuples, repeated on every row.
        * ``IFI`` - see the note next to its computation below.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    data = pd.read_csv(file_path)

    required_columns = ['No.', 'Time', 'Source', 'Destination', 'Protocol', 'Length', 'Info']
    missing_columns = [name for name in required_columns if name not in data.columns]
    if missing_columns:
        raise KeyError(f"{file_path}: missing required column(s) {missing_columns}")

    # Parse SrcPort, DstPort and PS out of Info in one vectorised pass (same
    # pattern that extract_info searches for). astype(str) lets empty cells
    # (NaN) go through the string accessor; they simply do not match.
    info_pattern = r'(\d+)\s+>\s+(\d+)\s+Len=(\d+)'
    parsed = data['Info'].astype(str).str.extract(info_pattern)
    # Unmatched rows are NaN -> 0; going through float keeps this working
    # whether the extracted columns are object or string dtype.
    data[['SrcPort', 'DstPort', 'PS']] = parsed.astype(float).fillna(0).astype(int)

    # IPI (Inter Packet Interval): gap to the previous row, 0 for the first row.
    data['IPI'] = data['Time'].diff().fillna(0)

    # Grouping keys. astype(str) keeps non-string cells from breaking the
    # concatenation (FlowID previously joined the raw values directly).
    endpoints = data['Source'].astype(str) + '-' + data['Destination'].astype(str)
    protocol = data['Protocol'].astype(str)
    data['FlowID'] = endpoints + '-' + protocol
    data['FrameID'] = (
        endpoints
        + '-' + data['SrcPort'].astype(str)
        + '-' + data['DstPort'].astype(str)
        + '-' + protocol
    )

    # Per-group totals broadcast back onto every row of the group.
    by_flow = data.groupby('FlowID')['Length']
    by_frame = data.groupby('FrameID')['Length']
    data['FlowSizeBytes'] = by_flow.transform('sum')
    data['FlowSizePackets'] = by_flow.transform('size')
    data['FS'] = by_frame.transform('sum')
    data['FS(PKT)'] = by_frame.transform('size')

    # IFI: Time gap to the previous row of the same FrameID group (0 for the
    # first row of each group), accumulated over the whole file in row order.
    # NOTE(review): the cumsum is global, not per FrameID - confirm that a
    # running total across all groups is what IFI is meant to be.
    data['IFI'] = data.groupby('FrameID')['Time'].diff().fillna(0).cumsum()

    # Number of distinct FrameID values, repeated on every row.
    data['NumFrames'] = data['FrameID'].nunique()

    # Select the output columns and give the capture columns their final names.
    final_columns = ['No.', 'Source', 'Destination', 'SrcPort', 'DstPort', 'Protocol', 'PS', 'IPI',
                     'FlowSizeBytes', 'FlowSizePackets', 'FS', 'FS(PKT)', 'NumFrames', 'IFI']
    return data[final_columns].rename(
        columns={'No.': 'ID', 'Source': 'SrcIP', 'Destination': 'DstIP'}
    )

# Extract the features of every discovered CSV file and report each result.
for i, file_path in enumerate(files_with_extension, start=1):
    # The result is written next to its input as <input path>my<i>.csv.
    output_csv_file_path = file_path + 'my' + str(i) + '.csv'
    processed_data = process_packet_data(file_path)
    # index=False: the row index carries no information (the packet number is
    # already in the ID column) and would otherwise come back as an
    # 'Unnamed: 0' column when the file is read again.
    processed_data.to_csv(output_csv_file_path, index=False)
    print(str(i), output_csv_file_path)

# 3- **Merge the Extracted CSV files**

In [None]:
# Read and merge the csv files

'''
# Configuration
    # Input:
      1- root_directory --> the directory that holds the CSV files to merge
      (in this case, the root directory set in an earlier cell is reused)
      2- ds_name --> the name under which the merged CSV file is saved (without extension)
    # Output:
      1- The merged data is saved as <root_directory>/<ds_name>.csv
'''

import pandas as pd
#root_directory = "E:\Postdoc_UFScar\Dataset\Other Datasets\VOD"
import os

# Name of the merged dataset file (without extension).
ds_name = 'DS'
print(root_directory)
merged_csv_path = os.path.join(root_directory, ds_name + '.csv')

# Collect the CSV files to merge. The merged file itself lives in the same
# directory, so it is excluded; otherwise every re-run would feed the previous
# result back into the new one.
# NOTE(review): every other CSV below root_directory is merged, including raw
# capture files if they are still stored there - confirm this is intended.
merged_abs_path = os.path.abspath(merged_csv_path)
csvfiles = [
    csv_path
    for csv_path in find_files_with_extension(root_directory, '.csv')
    if os.path.abspath(csv_path) != merged_abs_path
]
if not csvfiles:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError(f"No CSV files to merge were found under {root_directory}")

# Read every file and stack the rows; columns are aligned by name.
dff = [pd.read_csv(mycsv) for mycsv in csvfiles]
df = pd.concat(dff, ignore_index=True, sort=False)
df.to_csv(merged_csv_path, index=False)
print(f"The df with {df.shape[0]} rows and {df.shape[1]} columns was stored as csv file!")