In [1]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Authentication setup (if needed)
auth_provider = PlainTextAuthProvider(username='your_username', password='your_password')

# Connecting to the cluster
cluster = Cluster(contact_points=['127.0.0.1'], port=9042)
session = cluster.connect()


In [4]:
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS your_keyspace
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}
""")

session.execute("""
 CREATE TABLE IF NOT EXISTS your_keyspace.your_table (
    id int PRIMARY KEY,
    Gender text,
    Customer_Type text,
    Age int,
    Type_of_Travel text,
    Class text,
    Flight_Distance int,
    Inflight_wifi_service int,
    Departure_Arrival_time_convenient int,
    Ease_of_Online_booking int,
    Gate_location int,
    Food_and_drink int,
    Online_boarding int,
    Seat_comfort int,
    Inflight_entertainment int,
    On_board_service int,
    Leg_room_service int,
    Baggage_handling int,
    Checkin_service int,
    Inflight_service int,
    Cleanliness int,
    Departure_Delay_in_Minutes int,
    Arrival_Delay_in_Minutes float,  -- Assuming possible decimals
    satisfaction text
    -- Add more columns as per your CSV structure
);

""")


<cassandra.cluster.ResultSet at 0x77cdbdf24730>

In [7]:
insert_query = """
    INSERT INTO your_keyspace.your_table (id, Gender, Customer_Type, Age, Type_of_Travel, Class, Flight_Distance, Inflight_wifi_service, Departure_Arrival_time_convenient, Ease_of_Online_booking, Gate_location, Food_and_drink, Online_boarding, Seat_comfort, Inflight_entertainment, On_board_service, Leg_room_service, Baggage_handling, Checkin_service, Inflight_service, Cleanliness, Departure_Delay_in_Minutes, Arrival_Delay_in_Minutes, satisfaction)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""


In [29]:
import pandas as pd
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement

# Assuming you've already connected to Cassandra
# cluster and session are your Cluster and Session instances

# Read the CSV file
csv_path = 'data/test.csv'
df = pd.read_csv(csv_path)

print(df.columns)
df.head()

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [11]:


prepared_statement = session.prepare(insert_query)

# Adjust the column names to match those in your DataFrame, excluding any index or unwanted columns
columns_to_insert = [
    'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance',
    'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling',
    'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes',
    'Arrival Delay in Minutes', 'satisfaction'
]


for _, row in df.iterrows():
    # Create a data tuple that only includes the columns you're intending to insert
    data_tuple = tuple(row[col] for col in columns_to_insert)
    # Convert NaN to None to handle missing values appropriately
    data_tuple = tuple(None if pd.isna(value) else value for value in data_tuple)
    # Execute the insert statement for each row
    session.execute(prepared_statement, data_tuple)


In [12]:
# Prepare the SELECT statement
select_query = "SELECT * FROM your_keyspace.your_table LIMIT 5;"

# Execute the query
rows = session.execute(select_query)

# Iterate over the result set and print each row
for row in rows:
    print(row)


Row(id=121478, age=31, arrival_delay_in_minutes=0.0, baggage_handling=1, checkin_service=5, field_5_='Eco', cleanliness=5, customer_type='Loyal Customer', departure_arrival_time_convenient=3, departure_delay_in_minutes=0, ease_of_online_booking=3, flight_distance=331, food_and_drink=5, gate_location=2, gender='Male', inflight_entertainment=5, inflight_service=5, inflight_wifi_service=3, leg_room_service=1, on_board_service=5, online_boarding=3, satisfaction='neutral or dissatisfied', seat_comfort=5, type_of_travel='Personal Travel')
Row(id=122937, age=26, arrival_delay_in_minutes=25.0, baggage_handling=4, checkin_service=4, field_5_='Business', cleanliness=4, customer_type='Loyal Customer', departure_arrival_time_convenient=5, departure_delay_in_minutes=20, ease_of_online_booking=4, flight_distance=3691, food_and_drink=4, gate_location=4, gender='Female', inflight_entertainment=4, inflight_service=4, inflight_wifi_service=4, leg_room_service=2, on_board_service=5, online_boarding=4, sa

In [24]:
from pymongo import MongoClient

# Connect to MongoDB
mongo_client = MongoClient('mongodb://localhost:27017/')
mongo_db = mongo_client['your_database']
mongo_collection = mongo_db['your_collection']


In [25]:
select_query = "SELECT * FROM your_keyspace.your_table;"
rows = session.execute(select_query)


In [26]:
batch_size = 1000  # Define a suitable batch size
batch = []  # Initialize a batch

for row in rows:
    # Convert the Cassandra row to a dict and add to the batch
    batch.append({column: value for column, value in row._asdict().items()})
    
    # When the batch reaches the specified size, insert it and start a new batch
    if len(batch) >= batch_size:
        mongo_collection.insert_many(batch)
        batch = []  # Reset the batch

# Insert any remaining documents in the last batch
if batch:
    mongo_collection.insert_many(batch)


In [27]:
# Assuming mongo_collection is your MongoDB collection
documents = mongo_collection.find().limit(5)

for document in documents:
    print(document)


{'_id': ObjectId('65f6197938b3d9a9d355b69a'), 'id': 121478, 'age': 31, 'arrival_delay_in_minutes': 0.0, 'baggage_handling': 1, 'checkin_service': 5, 'field_5_': 'Eco', 'cleanliness': 5, 'customer_type': 'Loyal Customer', 'departure_arrival_time_convenient': 3, 'departure_delay_in_minutes': 0, 'ease_of_online_booking': 3, 'flight_distance': 331, 'food_and_drink': 5, 'gate_location': 2, 'gender': 'Male', 'inflight_entertainment': 5, 'inflight_service': 5, 'inflight_wifi_service': 3, 'leg_room_service': 1, 'on_board_service': 5, 'online_boarding': 3, 'satisfaction': 'neutral or dissatisfied', 'seat_comfort': 5, 'type_of_travel': 'Personal Travel'}
{'_id': ObjectId('65f6197938b3d9a9d355b69b'), 'id': 122937, 'age': 26, 'arrival_delay_in_minutes': 25.0, 'baggage_handling': 4, 'checkin_service': 4, 'field_5_': 'Business', 'cleanliness': 4, 'customer_type': 'Loyal Customer', 'departure_arrival_time_convenient': 5, 'departure_delay_in_minutes': 20, 'ease_of_online_booking': 4, 'flight_distance'

In [19]:
documents = mongo_collection.find().sort("created_at", 1).limit(5)


<pymongo.cursor.Cursor at 0x77cd9bd833d0>