-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
163 additions
and
119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,49 @@ | ||
"""DataFrames cleaning module.""" | ||
import pandas as pd | ||
|
||
|
||
def duplicated_rows(
    data_frame: pd.DataFrame, display_summary: bool = True, drop: bool = False
) -> pd.DataFrame:
    """Return every row that occurs more than once, optionally dropping repeats.

    Args:
        data_frame (pd.DataFrame): DataFrame to inspect. It is modified in
            place when ``drop`` is True.
        display_summary (bool, optional): Whether to print the number of
            duplicated rows and, after dropping, the resulting shape.
            Defaults to True.
        drop (bool, optional): Whether to remove repeated rows from
            ``data_frame`` in place. Defaults to False.

    Returns:
        pd.DataFrame: All rows involved in a duplication (``keep=False``, so
            first occurrences are included), selected before any dropping.
    """
    is_repeated = data_frame.duplicated(keep=False)
    repeated_rows: pd.DataFrame = data_frame[is_repeated]
    if display_summary:
        print(f"Number of duplicated rows: {repeated_rows.shape[0]}")
    if not drop:
        return repeated_rows
    data_frame.drop_duplicates(inplace=True)
    if display_summary:
        print("DataFrame shape after dropping duplicated rows:")
        print(data_frame.shape)
    return repeated_rows
|
||
|
||
def duplicated_columns(
    data_frame: pd.DataFrame, display_summary: bool = True, drop: bool = False
) -> pd.DataFrame:
    """Get duplicated columns in a DataFrame and drop them if specified.

    A column counts as duplicated when its values match, row for row, those
    of a column further to the left; the left-most column of each identical
    group is not reported.

    Args:
        data_frame (pd.DataFrame): DataFrame to get duplicated columns from.
            It is modified in place when ``drop`` is True.
        display_summary (bool, optional): Whether to print the number of
            duplicated columns and, after dropping, the resulting shape.
            Defaults to True.
        drop (bool, optional): Whether to drop the duplicated columns from
            ``data_frame`` in place. Defaults to False.

    Returns:
        pd.DataFrame: The duplicated columns, selected before any dropping.
    """
    if data_frame.empty:
        # No values to compare, so nothing is reported as duplicated.
        # (DataFrame.duplicated returns a zero-length mask for empty frames,
        # which would not align with the columns.)
        is_duplicate = [False] * data_frame.shape[1]
    else:
        # Transpose so each column becomes a row and is compared as a whole.
        # Checking duplicates row by row would flag a column whose values
        # match *different* earlier columns in different rows.
        is_duplicate = data_frame.T.duplicated().tolist()
    duplicated_data_frame: pd.DataFrame = data_frame.loc[:, is_duplicate]
    if display_summary:
        print(f"Number of duplicated columns: {duplicated_data_frame.shape[1]}")
    if drop:
        # NOTE(review): dropping is label-based, so if column labels repeat,
        # every column sharing a dropped label is removed.
        data_frame.drop(columns=duplicated_data_frame.columns, inplace=True)
        if display_summary:
            print("DataFrame shape after dropping duplicated columns:")
            print(data_frame.shape)
    return duplicated_data_frame
"""DataFrames cleaning module.""" | ||
import pandas as pd | ||
|
||
|
||
def duplicated_rows(
    data_frame: pd.DataFrame, display_summary: bool = True, drop: bool = False
) -> pd.DataFrame:
    """Collect the rows that appear more than once and optionally drop repeats.

    Args:
        data_frame (pd.DataFrame): DataFrame to search for duplicated rows.
            Mutated in place when ``drop`` is True.
        display_summary (bool, optional): Whether to print the duplicated-row
            count and, when dropping, the shape afterwards. Defaults to True.
        drop (bool, optional): Whether to drop duplicated rows from
            ``data_frame`` in place. Defaults to False.

    Returns:
        pd.DataFrame: Every row that has at least one identical twin
            (``keep=False``), taken before anything is dropped.
    """
    duplicate_mask = data_frame.duplicated(keep=False)
    matches: pd.DataFrame = data_frame[duplicate_mask]
    match_count = matches.shape[0]
    if display_summary:
        print(f"Number of duplicated rows: {match_count}")
    if drop:
        data_frame.drop_duplicates(inplace=True)
        if display_summary:
            print("DataFrame shape after dropping duplicated rows:")
            print(data_frame.shape)
    return matches
|
||
|
||
def duplicated_columns(
    data_frame: pd.DataFrame, display_summary: bool = True, drop: bool = False
) -> pd.DataFrame:
    """Get duplicated columns in a DataFrame and drop them if specified.

    A column counts as duplicated when its values match, row for row, those
    of a column further to the left; the left-most column of each identical
    group is not reported.

    Args:
        data_frame (pd.DataFrame): DataFrame to get duplicated columns from.
            It is modified in place when ``drop`` is True.
        display_summary (bool, optional): Whether to print the number of
            duplicated columns and, after dropping, the resulting shape.
            Defaults to True.
        drop (bool, optional): Whether to drop the duplicated columns from
            ``data_frame`` in place. Defaults to False.

    Returns:
        pd.DataFrame: The duplicated columns, selected before any dropping.
    """
    if data_frame.empty:
        # No values to compare, so nothing is reported as duplicated.
        # (DataFrame.duplicated returns a zero-length mask for empty frames,
        # which would not align with the columns.)
        is_duplicate = [False] * data_frame.shape[1]
    else:
        # Transpose so each column becomes a row and is compared as a whole.
        # Checking duplicates row by row would flag a column whose values
        # match *different* earlier columns in different rows.
        is_duplicate = data_frame.T.duplicated().tolist()
    duplicated_data_frame: pd.DataFrame = data_frame.loc[:, is_duplicate]
    if display_summary:
        print(f"Number of duplicated columns: {duplicated_data_frame.shape[1]}")
    if drop:
        # NOTE(review): dropping is label-based, so if column labels repeat,
        # every column sharing a dropped label is removed.
        data_frame.drop(columns=duplicated_data_frame.columns, inplace=True)
        if display_summary:
            print("DataFrame shape after dropping duplicated columns:")
            print(data_frame.shape)
    return duplicated_data_frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""Lineplot module.""" | ||
import pandas as pd | ||
import plotly.express as px | ||
import plotly.graph_objects as go | ||
|
||
from .style import custom_template, format_title | ||
|
||
|
||
def lineplot(
    data_frame: pd.DataFrame,
    x_axis: str,
    y_axis: str,
    title: str,
    subtitle: str = None,
    **kwargs,
) -> go.Figure:
    """Create a line plot.

    Args:
        data_frame (pd.DataFrame): DataFrame to plot.
        x_axis (str): Column to use as x axis.
        y_axis (str): Column to use as y axis.
        title (str): Title of the plot.
        subtitle (str, optional): Subtitle of the plot. Defaults to None.
        **kwargs: Keyword arguments to pass to plotly.express.line. ``width``
            (800), ``height`` (400) and ``template`` (the package's
            ``custom_template``) are only defaults and may be overridden here.

    Returns:
        go.Figure: Line plot figure.
    """
    # Apply the figure defaults through kwargs: passing them as explicit
    # keywords next to **kwargs raises a duplicate-keyword TypeError as soon
    # as a caller supplies their own width/height/template.
    kwargs.setdefault("template", custom_template)
    kwargs.setdefault("width", 800)
    kwargs.setdefault("height", 400)
    return px.line(
        data_frame,
        x=x_axis,
        y=y_axis,
        title=format_title(title, subtitle=subtitle),
        **kwargs,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
pandas==1.5.1 | ||
scipy==1.9.3 | ||
scikit-learn==1.1.3 | ||
plotly==5.11.0 | ||
nbformat==5.7.0 | ||
ipython==8.6.0 | ||
openrouteservice==2.3.3 | ||
pytest-cov==4.0.0 | ||
black==22.10.0 | ||
pylint==2.15.5 | ||
isort==5.10.1 | ||
pre-commit==2.20.0 | ||
pandas>=1.5.1 | ||
scipy>=1.9.3 | ||
scikit-learn>=1.1.3 | ||
plotly>=5.11.0 | ||
nbformat>=5.7.0 | ||
ipython>=8.6.0 | ||
openrouteservice>=2.3.3 | ||
pytest-cov>=4.0.0 | ||
black>=22.10.0 | ||
pylint>=2.15.5 | ||
isort>=5.12.0 | ||
pre-commit>=2.20.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,56 @@ | ||
"""Test duplicates.py functions.""" | ||
from unittest.mock import MagicMock, patch | ||
|
||
import pandas as pd | ||
from numpy.testing import assert_equal | ||
|
||
from coolpandas import eda | ||
|
||
|
||
@patch("builtins.print")
def test_duplicated_rows(mock_print: MagicMock, test_dataframe: pd.DataFrame) -> None:
    """Check duplicated_rows detection, in-place dropping and summary output."""
    # The fixture itself is expected to contain no repeated rows.
    found: pd.DataFrame = eda.duplicated_rows(
        test_dataframe, display_summary=False, drop=False
    )
    assert found.empty

    # Appending the head of the frame to itself introduces repeated rows.
    with_repeats: pd.DataFrame = pd.concat([test_dataframe, test_dataframe.head()])
    found = eda.duplicated_rows(with_repeats, display_summary=False, drop=False)
    assert len(found) == 8

    # One print without dropping, three with dropping (count + two shape lines).
    for should_drop in (False, True):
        eda.duplicated_rows(with_repeats, display_summary=True, drop=should_drop)
    assert_equal(with_repeats.fillna(0).values, test_dataframe.fillna(0).values)
    assert mock_print.call_count == 4
|
||
|
||
@patch("builtins.print")
def test_duplicated_columns(
    mock_print: MagicMock, test_dataframe: pd.DataFrame
) -> None:
    """Check duplicated_columns detection, in-place dropping and summary output."""
    # The fixture itself is expected to contain no repeated columns.
    found: pd.DataFrame = eda.duplicated_columns(
        test_dataframe, display_summary=False, drop=False
    )
    assert found.empty

    # Copy an existing column under a new label to create one duplicate.
    with_copy: pd.DataFrame = test_dataframe.copy()
    with_copy["test"] = with_copy["Animal"]
    found = eda.duplicated_columns(with_copy, display_summary=False, drop=False)
    assert_equal(found.columns.values, ["test"])

    # One print without dropping, three with dropping (count + two shape lines).
    for should_drop in (False, True):
        eda.duplicated_columns(with_copy, display_summary=True, drop=should_drop)
    assert_equal(with_copy.columns.values, test_dataframe.columns.values)
    assert mock_print.call_count == 4
"""Test duplicates.py functions.""" | ||
from unittest.mock import MagicMock, patch | ||
|
||
import pandas as pd | ||
from numpy.testing import assert_equal | ||
|
||
from coolpandas import eda | ||
|
||
|
||
@patch("builtins.print")
def test_duplicated_rows(mock_print: MagicMock, test_dataframe: pd.DataFrame) -> None:
    """Exercise duplicated_rows with and without duplicates, summary and drop."""
    quiet = {"display_summary": False, "drop": False}

    # No repeated rows are expected in the untouched fixture.
    assert eda.duplicated_rows(test_dataframe, **quiet).empty

    # Re-append the first rows so the frame contains repeats.
    doubled: pd.DataFrame = pd.concat([test_dataframe, test_dataframe.head()])
    repeated: pd.DataFrame = eda.duplicated_rows(doubled, **quiet)
    assert len(repeated) == 8

    # Summary only, then summary plus in-place drop: 1 + 3 print calls.
    eda.duplicated_rows(doubled, display_summary=True, drop=False)
    eda.duplicated_rows(doubled, display_summary=True, drop=True)
    expected = test_dataframe.fillna(0).values
    actual = doubled.fillna(0).values
    assert_equal(actual, expected)
    assert mock_print.call_count == 4
|
||
|
||
@patch("builtins.print")
def test_duplicated_columns(
    mock_print: MagicMock, test_dataframe: pd.DataFrame
) -> None:
    """Exercise duplicated_columns with and without duplicates, summary and drop."""
    quiet = {"display_summary": False, "drop": False}

    # No repeated columns are expected in the untouched fixture.
    assert eda.duplicated_columns(test_dataframe, **quiet).empty

    # Duplicate the "Animal" column under the label "test".
    augmented: pd.DataFrame = test_dataframe.copy()
    augmented["test"] = augmented["Animal"]
    repeated: pd.DataFrame = eda.duplicated_columns(augmented, **quiet)
    assert_equal(repeated.columns.values, ["test"])

    # Summary only, then summary plus in-place drop: 1 + 3 print calls.
    eda.duplicated_columns(augmented, display_summary=True, drop=False)
    eda.duplicated_columns(augmented, display_summary=True, drop=True)
    expected = test_dataframe.columns.values
    actual = augmented.columns.values
    assert_equal(actual, expected)
    assert mock_print.call_count == 4