In [None]:
%run "utilspro.py"


In [None]:
# Lift the column display limit so every column is shown when a DataFrame is rendered
pd.options.display.max_columns = None

# Read the CSV located at dataset_path into a DataFrame
# (dataset_path is not defined in this cell -- presumably provided by utilspro.py; TODO confirm)
data = pd.read_csv(dataset_path)

# Preview the first rows of the dataset
data.head()

In [None]:
# Display the info of the dataset: the number of rows and columns and the nulls per column, all in the same function
# Extracting modes: DataFrame.mode() returns the most frequent value(s) of every column
# NOTE(review): `modes` is not referenced again in the visible cells -- confirm whether it is still needed
modes = data.mode()

# Helper returning up to three of the most frequent values of a sequence
def three_most_common(lst):
    """Return up to three values from *lst*, most frequent first.

    Fewer than three values are returned when *lst* holds fewer than three
    distinct values; an empty input yields an empty list.
    """
    return [value for value, _ in Counter(lst).most_common(3)]



# Creating an information DataFrame: one row per column of `data`

# Count each column's values once and reuse the result for the three mode
# columns, instead of re-counting the whole column for every cell.
top_values = [three_most_common(data[col]) for col in data.columns]

# Null counts feed both 'Null Count' and '% Missing', so compute them once.
null_counts = data.isnull().sum()

# skew()/kurt() only apply to numeric columns. Restrict them explicitly and
# re-align the result on every column (NaN where not applicable); otherwise
# the arrays either fail to compute on string columns or come back shorter
# than data.columns and the DataFrame constructor rejects them.
skewness = data.skew(numeric_only=True).reindex(data.columns)
kurtosis = data.kurt(numeric_only=True).reindex(data.columns)

info_df = pd.DataFrame({
    'Column': data.columns,
    'Dtype': data.dtypes.values,
    'Null Count': null_counts.values,
    'Non-Null Count': data.count().values,
    'Unique Count': data.nunique().values,
    '% Missing': (null_counts / len(data) * 100).values,
    # A column with fewer than three distinct values gets None for the missing ranks
    '1st Mode': [top[0] if len(top) > 0 else None for top in top_values],
    '2nd Mode': [top[1] if len(top) > 1 else None for top in top_values],
    '3rd Mode': [top[2] if len(top) > 2 else None for top in top_values],
    'Skewness': skewness.values,
    'Kurtosis': kurtosis.values,
})

info_df


In [None]:
# Display `subdata`; it is not defined in the visible cells -- presumably created by utilspro.py (TODO confirm)
subdata

comments:


## Data structure and summary

In [None]:
import io

# Checking the data structure (data types and missing values).
# DataFrame.info() prints its report and returns None, so the text is captured
# in a buffer to make `data_info` actually hold the report.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
data_info = info_buffer.getvalue()
print(data_info)

# Checking for placeholder values (assuming '?' is a placeholder):
# number of cells equal to '?' in each column
placeholder_counts = data.apply(lambda column: (column == '?').sum())

# Getting statistical summary of the dataset for numerical columns
data_summary = data.describe()

placeholder_counts, data_summary


* #### Data Columns and Types:
The dataset has 141,712 entries and 36 columns.
Data types include objects (strings), integers, and booleans.
Several columns like opened_at, resolved_at, and closed_at are of object type (strings) representing dates. We may need to convert these to datetime format for further analysis.

* #### Placeholder Values (?):
Several columns contain the placeholder value ?. For instance, the cmdb_ci column has 141,267 such values, which is a significant portion of the total entries.
Other columns with a high count of placeholders include sys_created_by, u_symptom, problem_id, rfc, vendor, and caused_by.

* ### Statistical Summary:
reassignment_count ranges from 0 to 27, with an average of around 1.1.
reopen_count has a maximum value of 8, but 75% of the data is still 0, indicating that most incidents are not reopened.
sys_mod_count (probably indicating system modifications) has a wide range, with values from 0 to 129 and an average of around 5.

#### Granularity and uniqueness


In [None]:
# Compare the number of rows with the number of distinct values in the 'number' column
total_entries = data.shape[0]
unique_incidents = data['number'].nunique()

unique_incidents, total_entries

* 24,918 unique incidents (as indicated by the number column).
* 141,712 total entries.

This suggests that the dataset is not at the granularity of individual incidents. Instead, each incident has multiple entries, possibly capturing different states or updates related to the incident over time.

### Data distribution

We'll focus on the following columns:

 * Numerical: reassignment_count, reopen_count, and sys_mod_count.
* Categorical: incident_state, contact_type, and priority.

In [None]:
# Use seaborn's white-grid theme and open a 15x10-inch figure
sns.set_style("whitegrid")
plt.figure(figsize=(15, 10))

# Numerical columns whose distributions are plotted
numerical_columns = ['reassignment_count', 'reopen_count', 'sys_mod_count']

# One 30-bin histogram with a KDE curve per column, placed in slots 1..3 of a 2x3 grid
for position, column in enumerate(numerical_columns, start=1):
    axis = plt.subplot(2, 3, position)
    sns.histplot(data[column], kde=True, bins=30, ax=axis)
    axis.set_title(f'Distribution of {column}')
    axis.set_xlabel(column)
    axis.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

* ### Reassignment Count:
 Most incidents are reassigned once or not at all, with a sharp decline in frequency as the number of reassignments increases.
* ### Reopen Count:
The vast majority of incidents are not reopened. Only a small number of incidents have been reopened multiple times.
* ### Sys Mod Count (System Modification Count):
 Most incidents have a system modification count between 0 and 10. However, there's a long tail, indicating that some incidents have been modified many times.

Next, let's visualize the distribution of the selected categorical columns: incident_state, contact_type, and priority.

In [None]:
# Categorical columns whose value frequencies are plotted
categorical_columns = ['incident_state', 'contact_type', 'priority']

plt.figure(figsize=(15, 10))

# One horizontal count plot per column (slots 1..3 of a 2x3 grid),
# with categories ordered from most to least frequent
for position, column in enumerate(categorical_columns, start=1):
    axis = plt.subplot(2, 3, position)
    by_frequency = data[column].value_counts().index
    sns.countplot(data=data, y=column, order=by_frequency, ax=axis)
    axis.set_title(f'Distribution of {column}')
    axis.set_xlabel('Frequency')
    axis.set_ylabel(column)

plt.tight_layout()
plt.show()

* ### Incident State:
The most common state is "Active", followed by "New" and "Resolved".
States like "Awaiting Problem", "Awaiting Vendor", and "Awaiting Evidence" have considerably fewer occurrences.
* ### Contact Type:
The vast majority of incidents are reported via "Phone", with very few incidents reported through "Email", "Self service", or "Direct opening".
* ### Priority:
The "3 - Moderate" priority level is the most common, followed by "2 - High" and "4 - Low". There are fewer incidents with "1 - Critical" and "5 - Very Low" priorities.

### Correlation analysis

In [None]:
# Pairwise correlations between the numerical columns
# (numerical_columns is defined in the distribution-plot cell)
correlation_matrix = data[numerical_columns].corr()

# Annotated heatmap of the matrix, with the colour scale pinned to [-1, 1]
plt.figure(figsize=(10, 7))
heatmap_axis = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
)
heatmap_axis.set_title('Correlation Matrix of Numerical Variables')
plt.show()

* ### Reassignment Count and Sys Mod Count: 
There's a moderate positive correlation (approximately 0.53) between these two variables. This indicates that as incidents are reassigned more often, the number of system modifications also tends to increase.
* ### Reopen Count:
This variable doesn't have a strong correlation with the other two. Its correlation with both reassignment_count and sys_mod_count is relatively low.

### Anomaly Detection

We'll focus on identifying potential outliers within the numerical columns. Outliers can distort the results of our analyses and models.

In [None]:
# Plotting boxplots for anomaly detection in numerical columns
# (numerical_columns is defined in the distribution-plot cell)
plt.figure(figsize=(15, 7))

for i, column in enumerate(numerical_columns, 1):
    axis = plt.subplot(1, 3, i)
    # Pass the values explicitly as `x`: a positional Series is interpreted
    # differently across seaborn versions (as `x` with a deprecation warning,
    # or as `data`), which changes the box orientation. Using `x=` keeps the
    # values on the x-axis, matching the x-label set below.
    sns.boxplot(x=data[column], ax=axis)
    axis.set_title(f'Box Plot of {column}')
    axis.set_xlabel(column)

plt.tight_layout()
plt.show()

* #### Reassignment Count:
Most of the data points lie between 0 and 2. However, there are several outliers beyond this range, with some incidents having been reassigned more than 20 times.
* #### Reopen Count:
While the majority of incidents are not reopened, there are outliers where incidents have been reopened multiple times.
* #### Sys Mod Count:
Most of the incidents have undergone system modifications less than 15 times. Yet, there are outliers, with some incidents having more than 40 system modifications.

## Data Consistency 
We'll focus on checking whether:
* There are incidents with a closed_at date earlier than the opened_at date.
* There are incidents that are marked as active but have a closed_at date.

In [None]:
# Parse both timestamp columns in place as day-first dates;
# values that cannot be parsed are coerced to NaT
for date_column in ('opened_at', 'closed_at'):
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce', dayfirst=True)

# Rows whose 'closed_at' precedes 'opened_at'
closed_before_opened = data['closed_at'] < data['opened_at']
inconsistent_dates = data[closed_before_opened]

# Rows flagged as active that nevertheless carry a 'closed_at' value
active_with_closure = data['active'].eq(True) & data['closed_at'].notna()
active_but_closed = data[active_with_closure]

inconsistent_dates_count = inconsistent_dates.shape[0]
active_but_closed_count = active_but_closed.shape[0]

inconsistent_dates_count, active_but_closed_count

* #### There are no incidents with a closed_at date earlier than the opened_at date.
 This is good as it indicates consistent date information.
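* #### There are 116,726 entries that are marked as active but have a closed_at date.
 This is peculiar and suggests potential inconsistencies in the data. Incidents that are still active shouldn't have a closure date. Note that this figure counts rows rather than distinct incidents: the dataset contains only 24,918 unique incident numbers, so a single incident can contribute several rows to this count.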

NB: It's crucial to address the identified inconsistencies and anomalies before proceeding with building machine learning models.